# 1. DATA COLLECTION

## Import libraries

In [4]:
import requests
import json
import os
import urllib
import sys

## Define functions/classes

In [5]:
class Config:
    """Search settings for one Pinterest image query.

    Holds the keyword, the number of images wanted, the image quality key and
    the pagination bookmark, and derives the request parameters
    (``source_url`` and ``data``) sent to ``IMAGE_SEARCH_URL``.

    Attributes are stored under plural names (``search_keywords``,
    ``file_lengths``, ``image_qualitys``, ``bookmarks``); the singular
    properties below are read/write aliases kept for existing callers.
    """

    IMAGE_SEARCH_URL = "https://pinterest.com/resource/BaseSearchResource/get/?"

    # Lowercase Vietnamese accented letters, grouped by their unaccented base.
    _ACCENT_GROUPS = {
        "a": "áàảãạăắằẳẵặâấầẩẫậ",
        "e": "éèẻẽẹêếềểễệ",
        "i": "íìỉĩị",
        "o": "óòỏõọôốồổỗộơớờởỡợ",
        "u": "úùủũụưứừửữự",
        "y": "ýỳỷỹỵ",
        "d": "đ",
    }

    # str.translate table covering both lowercase and uppercase variants.
    # Built with plain loops because comprehensions cannot see class-level names.
    _ACCENT_TABLE = {}
    for _base, _variants in _ACCENT_GROUPS.items():
        for _char in _variants:
            _ACCENT_TABLE[ord(_char)] = _base
            _ACCENT_TABLE[ord(_char.upper())] = _base.upper()
    del _base, _variants, _char

    def __init__(self, search_keywords="", file_lengths=10, image_quality="orig", bookmarks=""):
        self.search_keywords = search_keywords
        self.file_lengths = file_lengths
        # Goes through the property setter, which stores it as image_qualitys.
        self.image_quality = image_quality
        self.bookmarks = bookmarks

    @property
    def search_url(self):
        """Return the image search endpoint URL."""
        return self.IMAGE_SEARCH_URL

    @property
    def source_url(self):
        """Return the value of the ``source_url`` search parameter.

        ``q`` carries the percent-encoded keyword as typed and ``eq`` the
        percent-encoded keyword with Vietnamese accents removed.
        """
        # Imported explicitly: a bare ``import urllib`` does not guarantee
        # that the ``urllib.parse`` submodule is loaded.
        from urllib.parse import quote

        keyword = self.search_keyword
        search_query = quote(keyword)
        search_query_eq = quote(self.remove_accents(keyword))

        return f"/search/pins/?rs=ac&len=2&q={search_query}&eq={search_query_eq}&etslf=1543"

    def remove_accents(self, text):
        """Return ``text`` with Vietnamese accented letters replaced by their
        unaccented base letter, preserving case.

        Characters outside the Vietnamese accent table are left unchanged.
        """
        return text.translate(self._ACCENT_TABLE)

    @property
    def image_data(self):
        """Return the ``data`` search parameter as a compact JSON string.

        Without a bookmark this is the first-page payload; with a bookmark it
        is the next-page payload carrying that bookmark. The payload is
        serialized with ``json.dumps`` so keywords or bookmarks containing
        quotes or backslashes are escaped instead of corrupting the JSON.
        """
        if self.bookmarks == "":
            options = {
                "isPrefetch": False,
                "query": self.search_keyword,
                "scope": "pins",
                "no_fetch_context_on_resource": False,
            }
        else:
            options = {
                "page_size": 25,
                "query": self.search_keyword,
                "scope": "pins",
                "bookmarks": [self.bookmark],
                "field_set_key": "unauth_react",
                "no_fetch_context_on_resource": False,
            }
        # Compact separators and raw non-ASCII keep the output identical to
        # the previous hand-built string for ordinary keywords.
        return json.dumps(
            {"options": options, "context": {}},
            separators=(",", ":"),
            ensure_ascii=False,
        )

    @property
    def search_keyword(self):
        """Alias for ``search_keywords``."""
        return self.search_keywords

    @search_keyword.setter
    def search_keyword(self, search_keywords):
        self.search_keywords = search_keywords

    @property
    def file_length(self):
        """Alias for ``file_lengths`` (number of images wanted)."""
        return self.file_lengths

    @file_length.setter
    def file_length(self, file_lengths):
        self.file_lengths = file_lengths

    @property
    def image_quality(self):
        """Image quality key, stored as ``image_qualitys``."""
        return self.image_qualitys

    @image_quality.setter
    def image_quality(self, image_qualitys):
        self.image_qualitys = image_qualitys

    @property
    def bookmark(self):
        """Alias for ``bookmarks`` (pagination token; ``""`` for the first page)."""
        return self.bookmarks

    @bookmark.setter
    def bookmark(self, bookmarks):
        self.bookmarks = bookmarks
    

In [None]:
class Scraper:
    """Collect Pinterest image URLs for a search config and download them.

    Args:
        config: Object exposing ``search_url``, ``source_url``, ``image_data``,
            ``image_quality``, ``file_length`` and a writable ``bookmarks``.
        image_urls: Optional list to accumulate URLs into. When omitted, each
            instance gets its own fresh list.
        save_name: Prefix used for downloaded file names.
    """

    def __init__(self, config, image_urls=None, save_name=""):
        self.config = config
        # A literal [] default would be shared by every Scraper created
        # without an explicit list, leaking URLs between instances.
        self.image_urls = [] if image_urls is None else image_urls
        self.save_name = save_name

    # Set config for bookmarks (next page)
    def setConfig(self, config):
        self.config = config

    def download_images(self):
        """Download the collected image URLs into ``../data/raw_photos``.

        Files are named ``{save_name}_{n}.{ext}`` where ``n`` counts the
        successful downloads starting at 1 and ``ext`` is ``jpg`` when the URL
        contains "jpg", otherwise ``png``. A failed download is printed and
        skipped so one bad URL does not abort the rest.
        """
        # Imported explicitly: a bare ``import urllib`` does not guarantee
        # that the ``urllib.request`` submodule is loaded.
        from urllib.request import urlretrieve

        results = self.get_urls()
        if not results:
            print("No image found")
            return

        folder = "../data/raw_photos"
        try:
            os.makedirs(folder)
            print("Directory ", folder, " Created ")
        except FileExistsError:
            pass

        number = 0
        for url in results:
            ext = "jpg" if "jpg" in url else "png"
            file_name = f"{self.save_name}_{number + 1}.{ext}"
            try:
                urlretrieve(url, f"{folder}/{file_name}")
            except Exception as e:
                # Deliberately best-effort: report and move on to the next URL.
                print(e)
            else:
                number += 1

    def get_urls(self):
        """Return up to ``config.file_length`` image URLs as a list.

        Requests result pages until enough URLs are in ``self.image_urls``,
        storing each response's bookmark in ``config.bookmarks`` so the next
        request asks for the following page. Stops early when the response has
        no ``resource_response``, has no bookmark, or a page adds no URLs.
        Always returns a list, possibly shorter than requested or empty.

        Errors from ``requests.get`` and ``json.loads`` propagate to the caller.
        """
        limit = int(self.config.file_length)

        while len(self.image_urls) < limit:
            response = requests.get(
                self.config.search_url,
                params={
                    "source_url": self.config.source_url,
                    "data": self.config.image_data,
                },
                timeout=30,  # never hang forever on an unresponsive server
            )
            payload = json.loads(response.content)
            if "resource_response" not in payload:
                break

            resource_response = payload["resource_response"]
            added = 0
            for item in resource_response["data"]["results"]:
                try:
                    url = item["images"][self.config.image_quality]["url"]
                except (KeyError, TypeError):
                    # Entry without an image at the requested quality: skip it.
                    continue
                self.image_urls.append(url)
                added += 1

            bookmark = resource_response.get("bookmark")
            if not bookmark or added == 0:
                # No next page, or the page contributed nothing; asking again
                # would loop without making progress.
                break
            self.config.bookmarks = bookmark

        return self.image_urls[:limit]

In [None]:
# Remove images      
def remove_images(folder):
    """Delete ``folder`` and everything inside it.

    Does nothing when the path does not exist. A path that is a plain file or
    a symlink is removed as a single entry. Filesystem errors are printed
    rather than raised.

    Uses ``shutil``/``os`` instead of shelling out to ``rm -rf`` so that it
    works on Windows and with paths containing spaces or shell metacharacters.
    """
    import shutil

    try:
        if os.path.isdir(folder) and not os.path.islink(folder):
            shutil.rmtree(folder)
        elif os.path.lexists(folder):
            os.remove(folder)
    except OSError as e:
        print(e)

In [36]:
def crawler(sport, file_lengths, image_quality):
    """Search for action photos of ``sport`` and download them.

    Args:
        sport: Sport name; used in the search keyword and as the file name
            prefix of the downloaded images.
        file_lengths: Number of images to request.
        image_quality: Image quality key passed to ``Config``.
    """
    scraper = Scraper(
        Config(sport + " action in real life", file_lengths, image_quality),
        [],
        sport,
    )
    print("Downloading images for ", sport)
    scraper.download_images()
    print(" >>> Finished downloading")

## Test

In [None]:
# TEST
# Smoke test with the keyword "Archery": print the collected URLs, then
# download them under the "Archery" file name prefix.
config = Config(
    search_keywords="archery action real life shot",
    file_lengths=10,
    image_quality="orig",
)
scraper = Scraper(config, [], "Archery")
print(scraper.get_urls())
scraper.download_images()
print("Done")

['https://i.pinimg.com/originals/6b/d6/f8/6bd6f8dbb062ba4a7343d04970a946bb.jpg', 'https://i.pinimg.com/originals/07/e9/53/07e9538c28eeaa96e63a8b0efce021ab.jpg', 'https://i.pinimg.com/originals/8d/f9/49/8df949378a06cd61298a3f19257d4262.jpg', 'https://i.pinimg.com/originals/0e/4d/9f/0e4d9f33f14017621dfa4ce427a38610.jpg', 'https://i.pinimg.com/originals/3e/e5/a0/3ee5a0fc7efefafff90f99bacb08e6fc.jpg', 'https://i.pinimg.com/originals/78/5f/fa/785ffa56b42b2678cbfe83d04de09c18.jpg', 'https://i.pinimg.com/originals/a9/20/f7/a920f79c3da0c1540471fba6826d8d48.jpg', 'https://i.pinimg.com/originals/15/7c/40/157c40004e20bf57c31d167228903e9f.jpg', 'https://i.pinimg.com/originals/7c/2e/aa/7c2eaa45b1363b62a4ac1822e3afc7df.jpg', 'https://i.pinimg.com/originals/f7/98/8c/f7988c9e30e99f19ce9a0b17b720e099.jpg']
../data/raw_photos
Directory  ../data/raw_photos  Created 
Done


## Main

In [27]:
# Read the sport categories, one per line, trimming surrounding whitespace.
with open("../data/metadata/sports_cate.txt", "r", encoding="utf-8") as file:
    sports_list = [line.strip() for line in file]

# Preview the first few sports
sports_list[:5]

['Soccer', 'Basketball', 'Tennis', 'Athletics', 'Swimming']

In [None]:
# Delete the raw photo folder and its contents before running the crawl below.
remove_images("../data/raw_photos")

In [42]:
for sport in sports_list:
    # Skip a sport whose image number 38 (.jpg or .png) is already on disk;
    # otherwise crawl 40 original-quality images for it.
    if not any(
        os.path.exists(f"../data/raw_photos/{sport}_38.{ext}")
        for ext in ("jpg", "png")
    ):
        crawler(sport, 40, "orig")


Downloading images for  Soccer
Directory  ../data/raw_photos  Created 
 >>> Finished downloading
Downloading images for  Basketball
<urlopen error [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond>
 >>> Finished downloading
Downloading images for  Tennis
 >>> Finished downloading
Downloading images for  Athletics
 >>> Finished downloading
Downloading images for  Swimming
 >>> Finished downloading
Downloading images for  Gymnastics
 >>> Finished downloading
Downloading images for  Volleyball
<urlopen error [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond>
<urlopen error [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, o