# Polyvore Crawler for Storing to DB

In [1]:
%matplotlib inline
import matplotlib
from matplotlib import pyplot as plt
import os
import time
import cv2
import json
import requests
from requests.exceptions import *
import psycopg2
import shutil
from PIL import Image
from lxml import html
from bs4 import BeautifulSoup
from datetime import date, datetime
import numpy as np
from numpy import *
from pylab import *
import pandas
from pandas import Series, DataFrame
import multiprocessing

In [2]:
# Connect to postgres database.
# The connection settings can be overridden with FASHION_DB_* environment
# variables so credentials do not have to be written into the notebook; the
# fallbacks are the original local development values.
conn = psycopg2.connect(
    database=os.environ.get("FASHION_DB_NAME", "fashion"),
    user=os.environ.get("FASHION_DB_USER", "michellechen"),
    password=os.environ.get("FASHION_DB_PASSWORD", ""),
    host=os.environ.get("FASHION_DB_HOST", "127.0.0.1"),
    port=os.environ.get("FASHION_DB_PORT", "5432"))

print("Opened database successfully")

# Connection cursor to database (shared by the crawl functions below)
cur = conn.cursor()

# Create outfits table: one row per crawled outfit
cur.execute('''CREATE TABLE IF NOT EXISTS outfits(
                outfit_id INT PRIMARY KEY NOT NULL,
                link TEXT NOT NULL,
                title TEXT,
                image TEXT,
                author TEXT,
                create_date TEXT,
                views INT,
                likes INT,
                description TEXT,
                similar1 TEXT,
                similar2 TEXT,
                similar3 TEXT,
                similar4 TEXT,
                similar5 TEXT
                )''')

# Create outfit_item table: pairing of an outfit with one of its items,
# keyed by the text "<outfit_id>-<item_id>"
cur.execute('''CREATE TABLE IF NOT EXISTS outfit_item(
                id TEXT PRIMARY KEY NOT NULL,
                outfit_id INT,
                item_id INT
                )''')

# Create items table: one row per crawled item
cur.execute('''CREATE TABLE IF NOT EXISTS items(
                item_id INT PRIMARY KEY NOT NULL,
                link TEXT NOT NULL,
                title TEXT,
                image TEXT,
                category1_id INT,
                category1_name TEXT,
                category2_id INT,
                category2_name TEXT,
                category3_id INT,
                category3_name TEXT,
                category4_id INT,
                category4_name TEXT,
                brand TEXT,
                price TEXT,
                likes INT,
                description TEXT,
                keyword1 TEXT,
                keyword2 TEXT,
                keyword3 TEXT,
                keyword4 TEXT,
                keyword5 TEXT,
                keyword6 TEXT
                )''')

# Persist the table definitions
conn.commit()

print("Table created successfully")

Opened database successfully
Table created successfully


In [3]:
# HTTP request headers: present the crawler as a desktop Chrome browser
head = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/59.0.3071.115 Safari/537.36'
    ),
}

In [4]:
# Crawl item data with item link
def crawl_item(day, outfit_id, line):
    """Crawl one item page, save its image and store the item in the database.

    Parameters:
        day: string used as the sub-folder name and file-name prefix of the
            saved image ('item_images/<day>/<day>-<outfit_id>-<item_id>.jpg').
        outfit_id: id of the outfit the item belongs to; used for the image
            file name and for the outfit_item pairing row.
        line: item link; the item id is the text after its last '=' and the
            request URL is the part before 'context_id' plus 'id=<item_id>'.

    Returns:
        0 when the item page answers 404, has no usable category id, or its
        first breadcrumb category is not 1 (Women's Fashion); 1 otherwise.
        A database error on insert is printed and does not change the result.

    Uses the module-level `head`, `cur` and `conn`.
    """
    # Initialize the fields that may stay empty when the page lacks them
    item_category_id = [0, 0, 0, 0]
    item_category_name = ["", "", "", ""]
    item_brand = ""
    item_price = ""
    item_description = ""
    item_keywords = ["", "", "", "", "", ""]

    # Item id
    item_id = int(line.split('\n')[0].split('=')[-1])

    # Item link
    item_link = line.split('context_id')[0] + "id=" + str(item_id)
    print("item link: " + item_link)

    # Get request response, retrying until the page is fetched or is a 404.
    # Every attempt (not only the first) is guarded against request errors
    # and checked for 404, so a 404 on a retry can not loop forever.
    while True:
        try:
            res = requests.get(item_link, headers=head)
        except RequestException:
            print("Item Request Connection Error!")
            print("Delay 1 minutes!")
            time.sleep(60)  # Delay 60 seconds
            continue

        if res.status_code == 200:
            break

        print("HTTP Response Error " + str(res.status_code))
        if res.status_code == 404:
            # Item page is not found
            print("Item Not Found!")
            return 0
        time.sleep(3)  # Delay 3 seconds

    # Parse html with BeautifulSoup
    soup = BeautifulSoup(res.text, "html.parser")

    # Item category: the first breadcrumb decides whether the item is kept
    crumbs = soup.select('.crumb')
    try:
        category = int(crumbs[0].select('a')[0]['href'].split('=')[-1])
    except (ValueError, IndexError, KeyError):  # Not in category
        print("Item Doesn't Have Category ID!")
        return 0

    if category != 1:  # Not in category "Women's Fashion"
        print("Item Not In Category Women's Fashion!")
        return 0

    for index, crumb in enumerate(crumbs):
        anchor = crumb.select('a')[0]
        href = anchor['href']

        # Only the first four breadcrumbs have a column in the items table;
        # further ones are still scanned for the brand below.
        if index < len(item_category_id):
            try:
                item_category_id[index] = int(href.split('=')[-1])
            except ValueError:
                pass
            item_category_name[index] = anchor.select('span')[0].text

        # Item brand
        if 'brand=' in href:
            item_brand = href.split('brand=')[-1].split('&')[0]

    # Item title
    item_title = soup.find_all('meta', {'property': 'og:title'})[0]['content']

    # Item image
    item_image = soup.find_all('meta', {'property': 'og:image'})[0]['content']

    # Save Item image as a jpg file
    while True:
        try:
            image_res = requests.get(item_image, stream=True)
        except RequestException:
            print("Image Request Connection Error!")
            time.sleep(3)  # Delay 3 seconds
        else:
            break

    image_path = ('item_images/' + day + '/' + day + '-' + str(outfit_id)
                  + '-' + str(item_id) + '.jpg')
    # The with-block guarantees the file is flushed and closed
    with open(image_path, 'wb') as f:
        shutil.copyfileobj(image_res.raw, f)

    # Item price: current price first, original price as fallback
    for selector in ('.price', '.orig_price'):
        try:
            item_price = soup.select('div.bd')[0].select(selector)[0].text
        except IndexError:
            continue
        break

    # Item likes
    item_likes = int(soup.find_all('meta', {'property': 'polyvore:saves'})[0]['content'])

    # Item description
    try:
        item_description = soup.select('.tease')[0].text
    except IndexError:
        pass

    # Item keyword: at most as many as there are keyword columns
    related = soup.select('#related_searches')
    if related:
        for index, keyword in enumerate(related[0].select('a')[:len(item_keywords)]):
            item_keywords[index] = keyword.text

    ############################# Insert item data into database ###############################

    # Insert item data into items table
    query = '''INSERT INTO items
                (item_id, link, title, image, category1_id, category1_name, category2_id, category2_name,
                category3_id, category3_name, category4_id, category4_name, brand, price, likes,
                description, keyword1, keyword2, keyword3, keyword4, keyword5, keyword6)
                VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);'''
    data = (item_id, item_link, item_title, item_image, item_category_id[0], item_category_name[0],
            item_category_id[1], item_category_name[1], item_category_id[2], item_category_name[2],
            item_category_id[3], item_category_name[3], item_brand, item_price, item_likes, item_description,
            item_keywords[0], item_keywords[1], item_keywords[2], item_keywords[3], item_keywords[4],
            item_keywords[5])

    try:
        cur.execute(query, data)
    except psycopg2.Error as e:
        # Typically a duplicate item; discard the failed transaction so the
        # connection is usable for the next statement.
        print("Item pass!")
        print(e.pgerror)
        conn.rollback()
    else:
        conn.commit()

    ########################## Insert outfit-item data into database ###########################

    # Insert outfit-item pairing data into outfit-item table
    pairing_query = '''INSERT INTO outfit_item (id, outfit_id, item_id)
                        VALUES(%s, %s, %s);'''
    pairing_data = (str(outfit_id) + "-" + str(item_id), outfit_id, item_id)

    try:
        cur.execute(pairing_query, pairing_data)
    except psycopg2.Error as e:
        print("Outfit-Item pass!")
        print(e.pgerror)
        conn.rollback()
    else:
        conn.commit()

    return 1
            

In [5]:
# Crawl outfit data with outfit link
def crawl_outfit (day, line):
    """Crawl one outfit page, store it in the database and crawl its items.

    Parameters:
        day: string used as the sub-folder name and file-name prefix of the
            saved image ('outfit_images/<day>/<day>-<outfit_id>.jpg'); it is
            also stored as the outfit's create_date.
        line: outfit link, possibly followed by a newline; the outfit id is
            the text after the last '=' of its last path segment.

    Returns:
        None. Nothing is stored when the link is empty or the page answers
        404. When crawl_item returns 0 for the first item, the outfit row and
        its image file are removed again; when it returns 0 for any item, the
        remaining items of the outfit are not crawled.

    Uses the module-level `head`, `cur`, `conn` and `crawl_item`.
    """
    # Outfit link: drop the line terminator so it is neither sent as part of
    # the request URL nor stored in the database
    outfit_link = line.split('\n')[0].strip()
    if not outfit_link:
        # An empty URL can never be fetched; retrying it would loop forever
        print("Empty outfit link, skip!")
        return

    # Get request response, retrying until the page is fetched or is a 404.
    # Every attempt (not only the first) is guarded against request errors
    # and checked for 404, so a 404 on a retry can not loop forever.
    while True:
        try:
            res = requests.get(outfit_link, headers=head)
        except RequestException:
            print("Outfit Request Connection Error!")
            print("Delay 1 minutes!")
            time.sleep(60)  # Delay 60 seconds
            continue

        if res.status_code == 200:
            break

        # Print error code
        print("HTTP Response Error " + str(res.status_code))
        if res.status_code == 404:
            # Skip to next outfit
            print("Outfit Not Found!")
            return
        # Delay 3 seconds, then request again
        time.sleep(3)

    # Parse html with BeautifulSoup
    soup = BeautifulSoup(res.text, "html.parser")

    print("outfit link: " + outfit_link)

    # Outfit id
    outfit_id = int(outfit_link.split('/')[-1].split('=')[-1])

    # Outfit title
    outfit_title = soup.find_all('meta', {'property': 'og:title'})[0]['content']

    # Outfit image
    outfit_image = soup.find_all('meta', {'property': 'og:image'})[0]['content']

    # Save outfit image as a jpg file
    while True:
        try:
            image_res = requests.get(outfit_image, stream=True)
        except RequestException:
            print("Image Request Connection Error!")
            time.sleep(3)  # Delay 3 seconds
            continue
        break

    outfit_image_path = 'outfit_images/' + day + '/' + day + '-' + str(outfit_id) + '.jpg'
    # The with-block guarantees the file is flushed and closed
    with open(outfit_image_path, 'wb') as f:
        shutil.copyfileobj(image_res.raw, f)

    # Outfit author link
    outfit_author = soup.find_all('meta', {'property': 'polyvore:author'})[0]['content']

    # Outfit create time
    outfit_date = day

    # Outfit views count: taken from the markup of the last <div> inside the
    # first 'div.meta' element (text after the first '.', commas removed)
    outfit_views = int(str(soup.select('div.meta')[0].select('div')[-1])[5:-7].split('.')[1].split()[0].replace(',', ''))

    # Outfit likes count
    outfit_likes = int(soup.find_all('meta', {'property': 'polyvore:likes'})[0]['content'])

    # Outfit description
    outfit_description = soup.select('div.bd')[0].select('img')[0]['alt']

    # Outfit similar style
    outfit_similars = []
    for grid in soup.select('.grid_item.hover_container.type_set.span2w.span2h'):
        for main in grid.select('div.main'):
            outfit_similars.append('http://www.polyvore.com/' + main.select('a')[0]['href'][3:])

    # If similar outfits do not have 5 outfits, append empty ones
    # (a non-positive count adds nothing; only the first 5 are stored)
    outfit_similars.extend([""] * (5 - len(outfit_similars)))

    ############################# Insert outfit data into database #############################

    # Insert outfit data into outfit table
    outfit_query = '''INSERT INTO outfits
                        (outfit_id, link, title, image, author, create_date, views, likes,
                        description, similar1, similar2, similar3, similar4, similar5)
                        VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);'''
    outfit_data = (outfit_id, outfit_link, outfit_title, outfit_image, outfit_author, outfit_date,
                   outfit_views, outfit_likes, outfit_description,
                   outfit_similars[0], outfit_similars[1], outfit_similars[2], outfit_similars[3], outfit_similars[4])

    try:
        cur.execute(outfit_query, outfit_data)
    except psycopg2.Error as e:
        # Typically a duplicate outfit; discard the failed transaction so the
        # connection is usable for the next statement.
        print("Outfit pass!")
        print(e.pgerror)
        conn.rollback()
    else:
        conn.commit()

    #################################### Items in the outfit ###################################

    # Outfit set_item grid, flattened to the item entries in page order
    item_mains = [main
                  for grid in soup.select('.grid_item.hover_container.type_thing.span1w.span1h')
                  for main in grid.select('div.main')]

    for item_count, main in enumerate(item_mains):

        print("item" + str(item_count))

        item_link = 'http://www.polyvore.com/' + main.select('a')[0]['href'][3:]

        # Crawl item data, if item is not in category, then next item is not too
        if crawl_item(day, outfit_id, item_link) == 0:

            # If first item is not in category, the outfit is not a fashion outfit
            if item_count == 0:

                print("Remove outfit!")
                # Delete this outfit row in outfit table
                cur.execute('DELETE FROM outfits WHERE outfit_id = %s;', (outfit_id,))
                conn.commit()

                # Remove this outfit image file
                os.remove(outfit_image_path)

            # We do not need to run the next items
            break

    return


In [6]:
# Crawler for all outfits and items in one day
def crawler(day):
    """Crawl every outfit listed in the links file of one day.

    Reads 'outfit_links/Today_Outfit_Links_<day>.txt' line by line and calls
    crawl_outfit for each non-blank line, after making sure the image folders
    'outfit_images/<day>' and 'item_images/<day>' exist. Pauses 60 seconds
    before every 50th outfit.

    Parameters:
        day: string naming the day; part of the links file name and of the
            image folder names.

    Returns:
        None.
    """
    # Create new folders with day name for storing outfit and item images
    for folder in ('outfit_images/' + day, 'item_images/' + day):
        if not os.path.exists(folder):
            os.makedirs(folder)

    # Initialize
    outfit_count = 0

    # Open the file
    with open('outfit_links/Today_Outfit_Links_' + day + '.txt', 'r') as f:

        # Read every link in this txt file
        for line in f:

            # A blank line is not a link; requesting it could never succeed
            if not line.strip():
                continue

            print("outfit" + str(outfit_count))

            if outfit_count != 0 and outfit_count % 50 == 0:
                print("Delay 1 minute!")
                time.sleep(60)  # Delay 60 seconds

            # Crawl outfit
            crawl_outfit(day, line)

            print("\n")
            outfit_count += 1

    return
    

In [7]:
# Day to crawl; selects the links file and the image folders
day = '20171229'

try:
    crawler(day)
finally:
    # Close the connection to database, also when the crawl was interrupted
    # by an error
    cur.close()
    conn.close()

print("END!")

outfit0
outfit link: https://www.polyvore.com/duke_duchess_sussex_attend_christmas/set?id=232875116
item0
item link: http://www.polyvore.com/gebe_maternity_midi_dress_in/thing?id=228377829
item1
item link: http://www.polyvore.com/dolce_gabbana_rose-embellished_bar_velvet/thing?id=223120780
item2
item link: http://www.polyvore.com/gucci_mini_black_velvet_broadway/thing?id=224428600
Item pass!
ERROR:  duplicate key value violates unique constraint "items_pkey"
DETAIL:  Key (item_id)=(224428600) already exists.

item3
item link: http://www.polyvore.com/chanel_vintage_cc_fringe_earrings/thing?id=225523402
Item pass!
ERROR:  duplicate key value violates unique constraint "items_pkey"
DETAIL:  Key (item_id)=(225523402) already exists.

item4
item link: http://www.polyvore.com/belk_co_polished_wedding_band/thing?id=197693013
Item pass!
ERROR:  duplicate key value violates unique constraint "items_pkey"
DETAIL:  Key (item_id)=(197693013) already exists.

item5
item link: http://www.polyvore.co