# Canon DSLR Scraper

### Importing Libraries

In [1]:
from collections import OrderedDict
import csv
import datetime as dt
from datetime import date
from pprint import pprint
import time
import warnings

from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
import matplotlib.pyplot as plt
from more_itertools import unique_everseen
import numpy as np
import pandas as pd 
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from wordcloud import WordCloud

warnings.filterwarnings('ignore')

### Starting the WebDriver

In [2]:
#Starting a Chrome session and opening the Craigslist page to search from
driver = webdriver.Chrome()
driver.get("https://washingtondc.craigslist.org/nva")

#Locating the search box by its id. find_element(By.ID, ...) is used because
#the find_element_by_id helper was removed in Selenium 4.3
search = driver.find_element(By.ID, "query")

In [3]:
#Typing the search keyword into the query box and submitting it with ENTER
search.send_keys("Canon", Keys.ENTER)

In [4]:
#Selecting the photo/video and electronics categories: click the select-all
#checkbox, then the two category checkboxes, then the button that applies the
#changed selection (presumably the select-all click clears the defaults first
#- TODO confirm against the live page).
#find_element(By.CSS_SELECTOR, ...) is used because the find_element_by_*
#helpers were removed in Selenium 4.3
category_selectors = (
    'input.catcheck.selectallcb',
    ".catcheck.multi_checkbox[id='cat_pha']",
    ".catcheck.multi_checkbox[id='cat_ela']",
    "button.searchlink.linklike.changed_input.clickme",
)
for selector in category_selectors:
    driver.find_element(By.CSS_SELECTOR, selector).click()

#Bundling duplicates and requiring pictures
for checkbox_name in ("bundleDuplicates", "hasPic"):
    driver.find_element(By.NAME, checkbox_name).click()

In [5]:
#Inputting the min and max price. find_element(By.NAME, ...) is used because
#the find_element_by_name helper was removed in Selenium 4.3
minP = driver.find_element(By.NAME, "min_price")
minP.send_keys(500)

#Pausing for a second between filling the two price fields
time.sleep(1)

maxP = driver.find_element(By.NAME, "max_price")
maxP.send_keys(1500)

#Submitting the price filter
maxP.send_keys(Keys.ENTER)

### Gathering Eligible Links

In [6]:
#Parsing the currently loaded results page with Beautiful Soup
html = driver.page_source
soup = BeautifulSoup(markup=html, features="html.parser")

In [7]:
#Collecting the href of the first anchor inside every result row
nonDups = soup.find_all('li', class_='result-row')
nduplinks = [row.find('a').get('href') for row in nonDups]

#Collecting the href of the first anchor inside every duplicate-rows list
dups = soup.find_all('ul', class_='duplicate-rows')
duplinks = [dup_list.find('a').get('href') for dup_list in dups]

In [8]:
#Building the final link list: each result link is kept once, in page order
#(the picture and the title of a row both point at the same link), skipping
#links that also appear among the duplicate links and bare "#" hrefs
links = [
    url
    for url in unique_everseen(nduplinks)
    if url not in duplinks and url != "#"
]

In [9]:
#Visiting every listing link and collecting its title, price and description.
#itemDict holds one dict per listing; dailyitems holds the same values as
#[title, price, description] lists; count is the number of listings visited.
count = 0
dailyitems = []
itemDict = []
for link in links:
    #Clicking the item link through JavaScript. find_element(By.XPATH, ...) is
    #used because find_element_by_xpath was removed in Selenium 4.3
    element = driver.find_element(By.XPATH, '//a[@href="{}"]'.format(link))
    driver.execute_script('arguments[0].click();', element)

    #Getting the item html for beautiful soup
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")

    #Getting the item price and title
    price = soup.find('span', {'class': 'price'}).get_text()
    title = soup.find('span', {'id': 'titletextonly'}).get_text()

    #Getting the item text with newlines and the QR-code caption stripped out;
    #str.replace already removes every occurrence, so no loop is needed
    text = soup.find('section', {'id': 'postingbody'}).get_text()
    text = text.replace("\n", "").replace("QR Code Link to This Post", "")
    itemD = {"Title": title, "Price": price, "Description": text}

    #Making a list of dictionaries
    itemDict.append(itemD)

    #Adding to a list
    dailyitems.append([itemD['Title'], itemD['Price'], itemD['Description']])
    count += 1

    #Returning to the results page so the next link can be located
    driver.back()

In [10]:
#Ending the browser session now that scraping is done. quit() is used rather
#than close(): close() only closes the current window, while quit() also ends
#the WebDriver session and its driver process
driver.quit()

#Creating a dataframe of the items. Naming the columns keeps Title, Price and
#Description present even when no listing was scraped
current_listings = pd.DataFrame(itemDict, columns=['Title', 'Price', 'Description'])

In [11]:
#Converting the scraped price text (e.g. "$550" or "$1,200") to numbers.
#The conversion is skipped for any numeric dtype, not only int64, because the
#.str accessor raises on numeric columns; thousands separators are removed so
#pd.to_numeric does not fail on them
if not pd.api.types.is_numeric_dtype(current_listings['Price']):
    price_text = (
        current_listings['Price']
        .str.strip()
        .str.lstrip('$')
        .str.replace(',', '')
    )
    current_listings['Price'] = pd.to_numeric(price_text)

#Putting the columns in a fixed order
current_listings = current_listings[['Title', 'Description', 'Price']]

In [12]:
#Recording the time the script finished crawling; later cells use only its
#date part (time_ran.date()) to stamp new listings and sold items
time_ran = dt.datetime.now()

# Update Current Listings and Sold Items

### Read in Current Listings from Yesterday

In [13]:
#Loading the listings table saved for the day before
listings_csv = 'C:/Users/mikes/Documents/Python Scripts/Canon_Scraper/Data_Tables/CanonCurrentListings.csv'
old_listings = pd.read_csv(listings_csv)

### Merge Yesterday's and Today's Listings With the Same Title and Description

In [14]:
#Listings found on both days with an identical title and description; the
#price scraped today is stored as Current_Price and the Price column dropped
common = (
    old_listings
    .merge(current_listings, on=['Title', 'Description'])
    .assign(Current_Price=lambda merged: merged['Price'])
    .drop('Price', axis=1)
)

### Consider Cases Where the Description Has Been Changed Slightly

In [15]:
#Outer-joining today's and yesterday's listings on exact title and
#description; the _merge indicator column records which side a row came from
compDF = pd.merge(current_listings, old_listings, how='outer',
                  on=['Title', 'Description'], indicator=True)

#Today's listings not perfectly matched in yesterday's listings
is_today_only = compDF['_merge'] == 'left_only'
only_today = compDF[is_today_only]

#Yesterday's listings not perfectly matched in today's listings
is_old_only = compDF['_merge'] == 'right_only'
only_old = compDF[is_old_only]

For each item that appears only in yesterday's listings, we compare its description against every item that appears only in today's listings. If the fuzz ratio is greater than 85 (out of 100), we consider the two items a match and merge the old and new values.

In [16]:
#Pairing yesterday-only listings with today-only listings whose descriptions
#are nearly identical (fuzz ratio above 85 out of 100)
cols = ['Current_Price','Description','List_Date','List_Price','Title']
matched_rows = []

#Performing the matching. The outer loop walks the rows only_old had when the
#loop started; the inner loop re-reads only_today each time, so a listing that
#has already been paired cannot be paired again
for old_index, old_row in only_old.iterrows():
    for new_index, new_row in only_today.iterrows():
        if fuzz.ratio(old_row.Description, new_row.Description) > 85:
            #Keep today's price, description and title together with the
            #original list date and list price
            matched_rows.append({'Current_Price': new_row.Price,
                                 'Description': new_row.Description,
                                 'List_Date': old_row.List_Date,
                                 'List_Price': old_row.List_Price,
                                 'Title': new_row.Title})

            #Remove exactly the two paired rows, by index label. Filtering on
            #Description equality would also discard other, unmatched rows
            #that happen to share the same description text
            only_old = only_old.drop(old_index)
            only_today = only_today.drop(new_index)

            #An old listing is paired with at most one new listing
            break

#Building the matches dataframe once, instead of growing it row by row with
#DataFrame.append (removed in pandas 2.0)
matches = pd.DataFrame(matched_rows, columns=cols)

In [17]:
#Add the matches to the common listings. pd.concat is used because
#DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0
common = pd.concat([common, matches])

### Adding New Items to Current Listings

In [18]:
#Today's listings that matched nothing from yesterday are new. An explicit
#copy is taken so the columns added below are written to new_items itself
#rather than to a slice of only_today
new_items = only_today[['Title', 'Description', 'Price']].copy()

#Adding the list day to the new items
new_items['List_Date'] = time_ran.date()

#Renaming price to be List Price
new_items = new_items.rename(columns={'Price': 'List_Price'})

#Reflect the Current Price to be equal to the list price
new_items['Current_Price'] = new_items['List_Price']

#Adding new items to the common listings. pd.concat is used because
#DataFrame.append was removed in pandas 2.0
today_listings = pd.concat([common, new_items])

In [19]:
#Saving today's listings, without the index, to be used tomorrow
listings_csv = 'C:/Users/mikes/Documents/Python Scripts/Canon_Scraper/Data_Tables/CanonCurrentListings.csv'
today_listings.to_csv(listings_csv, index=False, encoding='utf-8')

### Adding Items Sold Today to Sold Items

In [20]:
#Yesterday's listings that are still unmatched are treated as sold today.
#An explicit copy is taken so the edits below work on an independent frame
#instead of modifying a slice in place
sold_today = only_old.copy()

#Updating the historical table of sold items
if not sold_today.empty:
    #The last known price becomes the sell price; the merge bookkeeping
    #columns are dropped
    sold_today = sold_today.drop(['_merge', 'Price'], axis=1)
    sold_today = sold_today.rename(columns={'Current_Price': 'Sell_Price'})
    sold_today['Sell_Date'] = time_ran.date()

    #Loading the sold items csv
    sold_items_csv = 'C:/Users/mikes/Documents/Python Scripts/Canon_Scraper/Data_Tables/SoldItems.csv'
    sold_items = pd.read_csv(sold_items_csv)

    #Adding items sold today to sold_items. pd.concat is used because
    #DataFrame.append was removed in pandas 2.0
    sold_items = pd.concat([sold_items, sold_today])

    #Writing to the sold items csv
    sold_items.to_csv(sold_items_csv, index=False, encoding='utf-8')

### Items Available Whose Price Differs From List_Price

In [21]:
#Showing listings whose current price differs from the original list price
#(the != comparison matches price increases as well as drops)
today_listings[today_listings['List_Price'] != today_listings['Current_Price']]

Unnamed: 0,Current_Price,Description,List_Date,List_Price,Title
0,550,Canon EOS 700D + EF-S 18-55mm Excellent condit...,2018-06-24,570.0,Canon EOS 700 D with EF-S 18-55mm
3,525,Sigma 80-400mm Telephoto Lens f/4.5-5.6 EX DG ...,2018-06-24,550.0,Sigma 80-400mm f/4.5-5.6 EX DG APO OS for Cano...
6,800,Selling Sony A7 full frame mirrorless camera w...,2018-06-24,750.0,Sony a7 with 4 Batteries
37,750,For sale : Canon eos 1d Mark IV camera body. ...,2018-06-24,775.0,Canon EOS 1D Mark IV
49,1199,"The glass is flawless, had a protective filter...",2018-06-28,1299.0,Canon EF 24-70mm f/2.8L II USM Lens
50,759,"The glass is flawless, had a protective filter...",2018-06-28,799.0,Canon EF 24mm f/1.4L II USM Lens
53,600,Canon EOS 60D DSLR camera plus kit lens. 18-13...,2018-06-29,700.0,Canon EOS 60D body plus 18-135mm lens
70,815,Selling my Canon EF 400mm f/5.6 prime lens. T...,2018-06-27,825.0,Canon 400mm F/5.6 Lens
72,750,"Canon 16-35ii 2.8L lens excellent condition,ca...",2018-07-02,795.0,Canon
80,500,The EOS Rebel T6i is a Digital SLR camera with...,2018-07-04,550.0,Canon EOS Rebel T6i DSLR Camera with 18-55mm lens


### Items Sold Today

In [22]:
#Displaying the listings recorded as sold in this run
sold_today

Unnamed: 0,Title,Description,Sell_Price,List_Date,List_Price,Sell_Date
111,Canon Macro Lens EF 100mm f/2.8 L IS USM,Canon Macro Lens EF 100mm f/2.8 L IS USM: $700...,650.0,2018-06-24,700.0,2018-07-07
112,For Canon: Sigma A 24-35mm f/2 DG HSM,I am switching to Sony and have the following ...,700.0,2018-06-24,700.0,2018-07-07
113,Canon EF 100-400mm f/4.5-5.6L IS USM Lens,I'm selling my first gen Canon 100-400mm telep...,750.0,2018-06-24,800.0,2018-07-07
114,Canon EF 85mm f/1.2 L II AF USM Lens + FREE UV...,I am selling my Canon EF 85mm f/1.2 L II AF US...,1499.0,2018-06-29,1499.0,2018-07-07
115,Canon EF 100mm f/2.8 L IS USM Lens (LIKE NEW),I am selling my Canon EF 100mm f/2.8L Macro IS...,649.0,2018-06-29,649.0,2018-07-07
116,Canon EF 24mm f/1.4 II EF L USM (LIKE NEW),I am selling my Canon EF 24mm f/1.4 II EF L US...,1299.0,2018-06-29,1299.0,2018-07-07
117,Canon 6D Body in Excellent Condition!!,Up for sale is a Canon 6D body! This is an ama...,800.0,2018-07-02,800.0,2018-07-07
118,Canon Plotter IPF820 Pro,"We are selling a Canon plotter, it's in great ...",650.0,2018-07-05,650.0,2018-07-07


### New Items Today

In [23]:
#Displaying the listings that were added as new in this run
new_items

Unnamed: 0,Title,Description,List_Price,List_Date,Current_Price
0,Excellent Condition Canon 70D Bundle,"Selling my gently loved, excellent condition C...",1250.0,2018-07-07,1250.0
5,Canon 70-200mm f/4 IS L,"Like new condition. Works great, but I never ...",750.0,2018-07-07,750.0
24,Canon Macro Lens EF 100mm f/2.8 L IS USM,Canon Macro Lens EF 100mm f/2.8 L IS USM: $675...,675.0,2018-07-07,675.0
27,Canon digital SLR 400D bag plus accessories,Canon digital SLR camera 400D Trades= sharp ...,700.0,2018-07-07,700.0
40,Subal Underwater Video Photo Dive Camera Housi...,"Rated for depths to 230 feet / 70 meters, so w...",1400.0,2018-07-07,1400.0
107,CAMERA Canon EOS Rebel T5,Like New. Used 2 TimesCanon EOS Rebel T5. With...,550.0,2018-07-07,550.0
