# inkme_datascraping

We will start the project by scraping tattoo images from an online repository in order to create our dataset. We will do this using Selenium on the Tattoodo website. A similar process will be used for the tattoo artist database.

## Tattoo image repository

In [1]:
import pandas as pd
import numpy as np

# This is the library that we will use to create break times in order to mimic human behaviour 
import time

# These are the classes we will use for interaction with a webpage
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# This library is used for interacting with the operating system
import os

Using Selenium with this website was trickier than I initially thought, so I've collected the URLs needed for each style separately.

### Watercolor

In [52]:
# Initiate the driver.
# Selenium 4 receives the chromedriver path through a Service object; passing
# the path positionally is the legacy Selenium 3 call style.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Navigate to the website
url = "https://www.tattoodo.com/tattoos?q=watercolor%20tattoo"

images = set()

try:
    driver.get(url)

    # Scroll down the page step by step, harvesting image URLs after every step
    while True:
        # Calculate the remaining height of the page
        total_height = driver.execute_script("return document.body.scrollHeight")
        current_height = driver.execute_script("return window.pageYOffset + window.innerHeight")
        remaining_height = total_height - current_height

        # Scrape the images currently present in the page
        # (find_elements_by_tag_name was removed in Selenium 4; use By locators)
        image_tags = driver.find_elements(By.TAG_NAME, "img")
        for tag in image_tags:
            src = tag.get_attribute("src")
            # <img> tags without a src yield None/"" - keep those out of the dataset
            if src:
                images.add(src)

        # If there is no remaining height, exit the loop.
        # The scroll offset can be fractional, so "< 1" is used instead of "== 0".
        if remaining_height < 1:
            break

        # Scroll a quarter of the remaining distance, but always at least 1px:
        # with plain "// 4" the step becomes 0 once fewer than 4px remain and
        # the loop would never reach the bottom.
        scroll_distance = max(int(remaining_height) // 4, 1)

        # Scroll down by the calculated distance
        driver.execute_script(f"window.scrollBy(0, {scroll_distance})")

        # Wait for new images to load
        time.sleep(4)
finally:
    # Quit the driver even if scraping fails, so no browser process is left behind
    driver.quit()

# Check length of the images URLs
len(images)

2003

In [None]:
# Collect the scraped URLs in a one-column dataframe and export them as a .csv

image_dict = dict(image_url=[*images])
watercolor = pd.DataFrame(data=image_dict)
watercolor.to_csv('watercolor.csv', index=False)

### Realism

In [66]:
# Initiate a fresh driver: the session used for the previous style has already
# been closed with driver.quit(), so it cannot be reused here.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Navigate to the website
url = "https://www.tattoodo.com/tattoos?style=realism"

realism = set()

try:
    driver.get(url)

    # Scroll down the page step by step, harvesting image URLs after every step
    while True:
        # Calculate the remaining height of the page
        total_height = driver.execute_script("return document.body.scrollHeight")
        current_height = driver.execute_script("return window.pageYOffset + window.innerHeight")
        remaining_height = total_height - current_height

        # Scrape the images currently present in the page
        # (find_elements_by_tag_name was removed in Selenium 4; use By locators)
        image_tags = driver.find_elements(By.TAG_NAME, "img")
        for tag in image_tags:
            src = tag.get_attribute("src")
            # <img> tags without a src yield None/"" - keep those out of the dataset
            if src:
                realism.add(src)

        # If there is no remaining height, exit the loop.
        # The scroll offset can be fractional, so "< 1" is used instead of "== 0".
        if remaining_height < 1:
            break

        # Scroll a quarter of the remaining distance, but always at least 1px:
        # with plain "// 4" the step becomes 0 once fewer than 4px remain and
        # the loop would never reach the bottom.
        scroll_distance = max(int(remaining_height) // 4, 1)

        # Scroll down by the calculated distance
        driver.execute_script(f"window.scrollBy(0, {scroll_distance})")

        # Wait for new images to load
        time.sleep(4)
finally:
    # Quit the driver even if scraping fails, so no browser process is left behind
    driver.quit()

# Check length of the images URLs
len(realism)

1917

In [None]:
# Collect the scraped URLs in a one-column dataframe and export them as a .csv

image_dict = dict(image_url=[*realism])
realism = pd.DataFrame(data=image_dict)
realism.to_csv('realism.csv', index=False)

### Blackwork

In [3]:
# Initiate a fresh driver: the session used for the previous style has already
# been closed with driver.quit(), so it cannot be reused here.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Navigate to the website
url = "https://www.tattoodo.com/tattoos?style=blackwork"

blackwork = set()

try:
    driver.get(url)

    # Scroll down the page step by step, harvesting image URLs after every step
    while True:
        # Calculate the remaining height of the page
        total_height = driver.execute_script("return document.body.scrollHeight")
        current_height = driver.execute_script("return window.pageYOffset + window.innerHeight")
        remaining_height = total_height - current_height

        # Scrape the images currently present in the page
        # (find_elements_by_tag_name was removed in Selenium 4; use By locators)
        image_tags = driver.find_elements(By.TAG_NAME, "img")
        for tag in image_tags:
            src = tag.get_attribute("src")
            # <img> tags without a src yield None/"" - keep those out of the dataset
            if src:
                blackwork.add(src)

        # If there is no remaining height, exit the loop.
        # The scroll offset can be fractional, so "< 1" is used instead of "== 0".
        if remaining_height < 1:
            break

        # Scroll a quarter of the remaining distance, but always at least 1px:
        # with plain "// 4" the step becomes 0 once fewer than 4px remain and
        # the loop would never reach the bottom.
        scroll_distance = max(int(remaining_height) // 4, 1)

        # Scroll down by the calculated distance
        driver.execute_script(f"window.scrollBy(0, {scroll_distance})")

        # Wait for new images to load
        time.sleep(4)
finally:
    # Quit the driver even if scraping fails, so no browser process is left behind
    driver.quit()

# Check length of the images URLs
len(blackwork)

1912

In [None]:
# Collect the scraped URLs in a one-column dataframe and export them as a .csv

image_dict = dict(image_url=[*blackwork])
blackwork = pd.DataFrame(data=image_dict)
blackwork.to_csv('blackwork.csv', index=False)

### Fineline

In [10]:
# Initiate a fresh driver: the session used for the previous style has already
# been closed with driver.quit(), so it cannot be reused here.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Navigate to the website
url = "https://www.tattoodo.com/tattoos?style=fine_line"

fineline = set()

try:
    driver.get(url)

    # Scroll down the page step by step, harvesting image URLs after every step
    while True:
        # Calculate the remaining height of the page
        total_height = driver.execute_script("return document.body.scrollHeight")
        current_height = driver.execute_script("return window.pageYOffset + window.innerHeight")
        remaining_height = total_height - current_height

        # Scrape the images currently present in the page
        # (find_elements_by_tag_name was removed in Selenium 4; use By locators)
        image_tags = driver.find_elements(By.TAG_NAME, "img")
        for tag in image_tags:
            src = tag.get_attribute("src")
            # <img> tags without a src yield None/"" - keep those out of the dataset
            if src:
                fineline.add(src)

        # If there is no remaining height, exit the loop.
        # The scroll offset can be fractional, so "< 1" is used instead of "== 0".
        if remaining_height < 1:
            break

        # Scroll a quarter of the remaining distance, but always at least 1px:
        # with plain "// 4" the step becomes 0 once fewer than 4px remain and
        # the loop would never reach the bottom.
        scroll_distance = max(int(remaining_height) // 4, 1)

        # Scroll down by the calculated distance
        driver.execute_script(f"window.scrollBy(0, {scroll_distance})")

        # Wait for new images to load
        time.sleep(4)
finally:
    # Quit the driver even if scraping fails, so no browser process is left behind
    driver.quit()

# Check length of the images URLs
len(fineline)

1926

In [None]:
# Collect the scraped URLs in a one-column dataframe and export them as a .csv

image_dict = dict(image_url=[*fineline])
fineline = pd.DataFrame(data=image_dict)
fineline.to_csv('fineline.csv', index=False)

## Tattoo artist database

The tattoo artist database was obtained through a similar scraping process on the Tattoos Wizard website. Below I'll work with the already assembled .csv.

In [None]:
# Load the semicolon-separated tattoo artist .csv and preview its first rows

artists = pd.read_csv(
    "/Users/caionunez/Desktop/tattooartistdbvfinal.csv",
    delimiter=";",
)
artists.head()

In [None]:
# Count the missing values in every column

artists.isna().sum()

In [None]:
# Doing some minor data preparation and cleaning

# Moving the values of column "Tag 1" into a new "Style" column
artists["Style"] = artists["Tag 1"]
artists.drop(columns = ["Tag 1"], inplace = True)


def _clean_style_value(value):
    """Return one cell of the Style column in its normalized form.

    The cell holds one or more comma-separated style names. In a single pass:
    every occurrence of "black&gray" becomes "blackwork", each style is
    stripped of surrounding whitespace, the spelling "watercolour" becomes
    "watercolor", each style is capitalized, and the styles are re-joined
    with ", ".

    Non-string cells (e.g. NaN for a missing style) are returned unchanged
    instead of raising AttributeError on ``.split``.
    """
    if not isinstance(value, str):
        return value
    # Substring replacement, so it also covers cells listing several styles
    value = value.replace('black&gray', 'blackwork')
    styles = (s.strip() for s in value.split(','))
    styles = ('watercolor' if s == 'watercolour' else s for s in styles)
    return ', '.join(s.capitalize() for s in styles)


# Capitalizing style names in Style column
# (kept so that code calling capitalize_style on a row keeps working)
def capitalize_style(row):
    """Return the row's ", "-separated Style value with each style capitalized."""
    style = row['Style']
    if not isinstance(style, str):
        # Missing style: nothing to capitalize
        return style
    return ', '.join(s.capitalize() for s in style.split(', '))


# One pass over the column replaces the separate replace/strip/capitalize passes
artists['Style'] = artists['Style'].map(_clean_style_value)

# Exporting the csv (the dataframe index is written too, as the first column)
artists.to_csv("artist_dbfinal.csv")

In [None]:
# Creating function that will be used for presenting tattoo recommendations later on

def get_artists(style, df, n=3):
    """Print a random sample of artists whose ``Style`` contains *style*.

    Args:
        style: Text searched for in the ``Style`` column with
            ``Series.str.contains`` (case-sensitive, pandas default matching).
        df: Artist dataframe; must have a ``Style`` column.
        n: Maximum number of artists to show. Defaults to 3.

    Returns:
        None. The sampled rows are printed as an array (``DataFrame.values``).
        Fewer than ``n`` rows are printed when fewer artists match, and an
        empty array when none do.
    """
    # na=False: a missing Style counts as "no match"; otherwise the mask would
    # contain NaN and could not be used for boolean indexing.
    style_mask = df['Style'].str.contains(style, na=False)
    matches = df[style_mask]

    # Never request more rows than exist - sample() raises ValueError otherwise.
    sample_size = min(n, len(matches))
    style_df = matches.sample(n=sample_size) if sample_size > 0 else matches.head(0)
    return print(style_df.values)