In [None]:
import requests
from bs4 import BeautifulSoup
import os
import urllib.parse
from google.colab import drive
import base64
import re
import csv
import urllib.parse

In [None]:
# Mount Google Drive at /content/drive (Colab's `google.colab.drive` helper) so
# that paths under that mount point can be read and written by later cells.
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
def download_image(url, directory, filename, timeout=30):
    """Save the image referenced by ``url`` as ``directory/filename``.

    Two kinds of sources are supported:

    * ``data:image/...`` URLs, whose payload is decoded locally: base64 when
      the header ends with ``;base64``, percent-decoding otherwise (for
      example inline SVG);
    * any other URL, which is fetched over the network with ``requests``.

    This is a best-effort helper: failures are reported with ``print`` and
    never raised.

    Args:
        url: Image source, either a ``data:image`` URL or a fetchable URL.
        directory: Directory the file is written into. It is not created
            here, so it must already exist.
        filename: Name of the file to create inside ``directory``.
        timeout: Seconds to wait for the remote server before giving up.
            Ignored for ``data:`` URLs.

    Returns:
        True when the file was written, False otherwise.
    """
    target_path = os.path.join(directory, filename)
    try:
        if url.startswith('data:image'):
            # A data URL is "<header>,<payload>"; the header says how the
            # payload is encoded.
            header, separator, payload = url.partition(',')
            if not separator:
                # No payload at all: nothing that could be written.
                print(f"Failed to download: {filename}")
                return False
            if header.lower().endswith(';base64'):
                image_bytes = base64.b64decode(payload)
            else:
                # Non-base64 data URLs carry percent-encoded bytes.
                image_bytes = urllib.parse.unquote_to_bytes(payload)
        else:
            # Without a timeout a stalled server would block the call forever.
            response = requests.get(url, timeout=timeout)
            if response.status_code != 200:
                print(f"Failed to download: {filename}")
                return False
            image_bytes = response.content

        with open(target_path, 'wb') as f:
            f.write(image_bytes)
        return True
    except Exception as e:
        print(f"An error occurred while downloading {filename}: {str(e)}")
        return False

In [None]:
def _build_unsplash_search_url(query):
    """Return the free-license Unsplash search URL for ``query``."""
    # Normalise whitespace and percent-encode exactly once. Encoding spaces
    # by hand and then quoting again would turn "%20" into "%2520".
    normalized = " ".join(query.split())
    encoded = urllib.parse.quote(normalized, safe='')
    return f"https://unsplash.com/s/photos/{encoded}?license=free"


def _extract_image_fields(div, title, href):
    """Collect the Alt/URL/Title/Anchor_Href fields from the <img> tags in ``div``."""
    fields = {}
    for image in div.find_all('img'):
        attrs = image.attrs
        if 'alt' in attrs and 'srcset' in attrs and 'src' in attrs:
            # Fully loaded image: take the URL and alt text as they are.
            fields.update({'Alt': image['alt'], 'URL': image['src'],
                           'Title': title, 'Anchor_Href': href})
        elif 'data-srcset' in attrs:
            # Lazy-loaded image: the first URL of the srcset is used. \S+ also
            # matches a URL that is the very last token of the attribute.
            match = re.search(r'https://\S+', image['data-srcset'])
            if match:
                fields.update({'Alt': attrs.get('alt', ''), 'URL': match.group(0),
                               'Title': title, 'Anchor_Href': href})
    return fields


def _fetch_related_titles(href, timeout):
    """Return the link texts of the related block on a photo page, or None."""
    # urljoin copes with both site-relative and absolute hrefs.
    page_url = urllib.parse.urljoin("https://unsplash.com", href)
    response = requests.get(page_url, timeout=timeout)
    if response.status_code != 200:
        return None
    page_soup = BeautifulSoup(response.content, 'html.parser')
    related_div = page_soup.find('div', class_='VZRk3 rLPoM')
    if related_div is None:
        return None
    return [link.get_text() for link in related_div.find_all('a')]


def crawl_unsplash(query, directory, timeout=30):
    """Scrape one Unsplash search result page and store the findings as CSV.

    For every ``div`` with class ``MorZF`` that sits inside an
    ``<a itemprop="contentUrl">`` the image URL, alt text, anchor title and
    anchor href are collected. The photo page behind the anchor is then
    visited to gather the link texts found in the ``VZRk3 rLPoM`` block.
    The rows are written to ``<directory>/<query>_image_data.csv``.

    The CSS class names are taken as-is from the page markup; presumably they
    are generated names that may change over time (verify against the live
    site when results come back empty).

    Errors are reported with ``print`` and never raised. A failing photo page
    only costs that row its related-image columns; it no longer aborts the
    whole crawl before the CSV is written.

    Args:
        query: Search phrase; also used verbatim in the CSV file name.
        directory: Existing directory that receives the CSV file.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        None.
    """
    url = _build_unsplash_search_url(query)
    image_data = []

    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            for div in soup.find_all('div', class_='MorZF'):
                anchor_tag = div.find_parent('a', itemprop='contentUrl')
                if anchor_tag is None:
                    continue
                title = anchor_tag.get('title', '')
                href = anchor_tag.get('href', '')
                scraped_data = _extract_image_fields(div, title, href)

                # Without an href there is no photo page to visit.
                if href:
                    try:
                        related_images = _fetch_related_titles(href, timeout)
                    except requests.RequestException as e:
                        print(f"Failed to fetch image page {href}: {str(e)}")
                        related_images = None
                    if related_images is not None:
                        # NOTE(review): Relative_Title / Relative_URL repeat the
                        # photo's own title and href, not those of the related
                        # links - kept as in the original; confirm intent.
                        scraped_data.update({'Relative_Title': title,
                                             'Relative_URL': href,
                                             'Related_Images': related_images})

                # Rows for which nothing could be extracted are dropped.
                if scraped_data:
                    image_data.append(scraped_data)

            csv_file_path = os.path.join(directory, f"{query}_image_data.csv")
            with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
                fieldnames = ['Alt', 'URL', 'Title', 'Anchor_Href',
                              'Relative_Title', 'Relative_URL', 'Related_Images']
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerows(image_data)
            print(f"Image data saved to: {csv_file_path}")
        else:
            print(f"Failed to fetch images for query: {query}")
    except Exception as e:
        print(f"An error occurred while crawling Unsplash: {str(e)}")

    print("Done.")

In [None]:
def generate_data(query_term, directory="/content/drive/MyDrive/MOS_2/Data_v3"):
    """Crawl Unsplash for ``query_term`` and store the CSV in ``directory``.

    When ``directory`` lies under the Colab Drive mount point
    (``/content/drive``) the mount must exist; otherwise a hint is printed
    and nothing is crawled. The output directory is created when missing.

    Args:
        query_term: Search phrase handed to ``crawl_unsplash``.
        directory: Output directory for the CSV file. Defaults to the
            project's folder on Google Drive.

    Returns:
        None.
    """
    drive_root = os.path.normpath("/content/drive/")
    # Compare with a trailing separator so "/content/drive2" is not mistaken
    # for a path inside the mount point.
    targets_drive = (os.path.normpath(directory) + os.sep).startswith(drive_root + os.sep)
    if targets_drive and not os.path.exists("/content/drive/"):
        print("Google Drive is not mounted. Please mount Google Drive.")
        return

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(directory, exist_ok=True)
    crawl_unsplash(query_term, directory)

In [None]:
# First batch of Unsplash search phrases, grouped loosely by kind.
queries = [
    # Broad themes
    "Funny cats", "Abstract paintings", "Vintage cars", "Cute puppies",
    "City skyline", "Mountain landscape", "Famous landmarks", "Healthy recipes",
    "Space exploration", "Wedding dresses", "Funny cartoons", "Home decor ideas",
    "Wildlife photography", "Fashion trends", "Historical architecture",
    "DIY projects", "Fitness inspiration", "Technology gadgets",
    "Travel destinations",
    # Single objects
    "Cat", "Flower", "Mountain", "Chair", "Car",
    # Actions
    "Running horse", "Playing basketball", "Person swimming",
    "Building a snowman", "Dancing people",
    # Landmarks
    "Eiffel Tower", "Statue of Liberty", "Great Wall of China", "Taj Mahal",
    "Colosseum",
    # Nature
    "Sunset", "Rainbow", "Waterfall", "Snowflakes", "Forest",
    # Detailed descriptions
    "Black cat with white paws", "Red roses in a vase", "Vintage car on a beach",
    "Snowy mountain peak", "Baby panda playing",
    # Places and sky
    "Golden Gate Bridge", "Northern Lights", "Milky Way galaxy", "Grand Canyon",
    "Pyramids of Giza",
    # Scenes
    "Sailing boats", "Cityscape at night", "Autumn leaves", "Underwater world",
    "Street art", "Morning dew", "Rural landscape",
    "Abstract geometric patterns", "Vintage bicycles",
    "Children playing in the park",
]

In [None]:
# Sanity check: print how many search phrases the `queries` list holds.
print(len(queries))

59


In [None]:
# Crawl the first batch: announce each phrase as written, search with its
# lower-cased form.
for query, query_term in ((phrase, phrase.lower()) for phrase in queries):
    print(f"Generating data for: {query}")
    generate_data(query_term)

In [None]:
# Second batch of Unsplash search phrases.
additional_queries = [
    # More specific variations
    "Beach vacation", "Family picnic", "Modern art", "Luxury cars",
    "Adorable kittens", "Metropolitan skyline", "Alpine scenery",
    "World heritage sites", "Vegan recipes", "Astronauts in space",
    "Bridal gowns", "Satirical comics", "Contemporary interior design",
    "Endangered species", "Runway fashion", "Ancient ruins",
    "Woodworking projects", "Yoga poses", "Latest gadgets",
    "Exotic travel destinations", "Artistic cats", "Colorful flowers",
    "Snow-capped peaks", "Antique furniture", "Horseback riding", "NBA games",
    "Swimming competitions", "Winter wonderland", "Street performers",
    "Historical monuments",
    # Shorter, more generic phrasings
    "Beach scene", "Family gathering", "Abstract artwork", "Classic cars",
    "Cute animals", "Cityscape", "Mountain view", "Landmark", "Healthy food",
    "Outer space", "Wedding", "Cartoon illustration", "Home decoration",
    "Wildlife", "Fashion style", "Ancient architecture", "Craft projects",
    "Fitness exercise", "New technology", "Travel", "Pets",
    "Floral arrangement", "Nature landscape", "Furniture design",
    "Animal running", "Sports game", "Swimming pool", "Winter scenery",
    "Street art", "Historical site",
]

In [None]:
# Crawl the second batch: announce each phrase as written, search with its
# lower-cased form.
for query, query_term in ((phrase, phrase.lower()) for phrase in additional_queries):
    print(f"Generating data for: {query}")
    generate_data(query_term)

In [None]:
# Third batch of Unsplash search phrases: music first, then film.
more_queries = [
    # Music
    "rock", "pop", "hip hop", "rap", "electronic music", "EDM", "techno",
    "house music", "ambient music", "world music", "folk music",
    "country music", "reggae", "ska", "punk", "metal", "indie music",
    "alternative music", "experimental music",
    # Film
    "film genres", "movie reviews", "cinematography", "film festivals",
    "movie soundtracks", "film directors", "actors", "actresses",
    "film awards", "Oscars", "Golden Globes", "Cannes Film Festival",
    "movie franchises", "superhero movies", "animated movies",
    "documentary films", "foreign films", "independent films", "cult films",
    "film noir", "horror movies", "thriller movies", "action movies",
    "adventure movies", "sci-fi movies", "fantasy movies", "romantic movies",
    "comedy movies", "drama movies", "biographical movies",
    "historical movies",
]

In [None]:
# Crawl the third batch: announce each phrase as written, search with its
# lower-cased form.
for query, query_term in ((phrase, phrase.lower()) for phrase in more_queries):
    print(f"Generating data for: {query}")
    generate_data(query_term)