# Canon DLSR Scraper

### Importing Libraries

In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from more_itertools import unique_everseen
from pprint import pprint
from wordcloud import WordCloud
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
from collections import OrderedDict
from datetime import date
import time
import csv
import datetime as dt

### Starting the WebDriver

In [2]:
#Starting the driver and finding the query element
driver = webdriver.Chrome()
driver.get("https://washingtondc.craigslist.org/nva")
search = driver.find_element_by_id("query")

In [3]:
#Inputing the search keyword
search.send_keys("Canon")
search.send_keys(Keys.ENTER)

In [4]:
#Selecting the photo/video and electronics cateories
driver.find_element_by_css_selector('input.catcheck.selectallcb').click()
driver.find_element_by_css_selector(".catcheck.multi_checkbox[id='cat_pha']").click()
driver.find_element_by_css_selector(".catcheck.multi_checkbox[id='cat_ela']").click()
driver.find_element_by_css_selector("button.searchlink.linklike.changed_input.clickme").click()

#Bundling duplicates, requiring pictures, not including nearby searches
driver.find_element_by_name("bundleDuplicates").click()
driver.find_element_by_name("hasPic").click()

In [5]:
#Inputing the min and max price
minP = driver.find_element_by_name("min_price")
minP.send_keys(500)
time.sleep(1)
maxP = driver.find_element_by_name("max_price")
maxP.send_keys(1500)
maxP.send_keys(Keys.ENTER)

### Gathering Eligible Links

In [6]:
#Prepping for beautiful soup
html = driver.page_source
soup = BeautifulSoup(html, "html.parser")

In [7]:
#Going through all content divs and finding the links
nduplinks = []
duplinks=[] 
#Finding the non duplicated results
nonDups = soup.findAll('li', {'class': 'result-row'})
for item in nonDups:
    nduplinks.append(item.find('a').get('href'))
#Finding the duplicated results
dups = soup.findAll('ul', {'class': 'duplicate-rows'})
for item in dups:
    duplinks.append(item.find('a').get('href'))

In [8]:
#Removing the duplicates, returning the links list
links = [x for x in nduplinks if x not in duplinks]

#Removing duplicate links due to pictures and titles both being the same
links = list(unique_everseen(links))
while "#" in links:
    links.remove("#")

# FOR TEST PURPOSES

In [9]:
links = links[:7]

# -------------------------------------------

In [10]:
#Clicking into the item links
count = 0
dailyitems=[]
itemDict = []
for link in links:
    #Clicking the item link
    link = '"'+link+'"'
    element = driver.find_element_by_xpath('//a[@href=%s]' %link)
    driver.execute_script('arguments[0].click();', element)

    #Getting the item html for beautiful soup
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    #Getting the item price and title
    price = soup.find('span', {'class': 'price'}).get_text()
    title = soup.find('span', {'id': 'titletextonly'}).get_text()
    #Getting the item text
    text = soup.find('section', {'id':'postingbody'}).get_text()
    while "\n" in text:
        text = text.replace("\n", "")
    while "QR Code Link to This Post" in text:
        text = text.replace("QR Code Link to This Post", "")
    itemD = {"Title":title, "Price":price, "Description":text}

    #Making a list of dictionaries
    itemDict.append(itemD)

    #Adding to a list
    dailyitems.append([itemD['Title'], itemD['Price'], itemD['Description']])
    count += 1
    driver.back()

In [11]:
#Outputting the results array item by item
driver.close()
#pprint(dailyitems)
df = pd.DataFrame(itemDict)
current_listings = df
#To Generate the first days csv
#current_listings.to_csv('CanonCurrentListings.csv', index=False, encoding='utf-8')

In [12]:
#Removing the dollar sign in current listings:
if current_listings.Price.dtypes.name != 'int64':
    current_listings['Price'] = pd.to_numeric(current_listings['Price'].str.replace('$', ''))

#Outputting current listings
current_listings = current_listings[['Title', 'Description', 'Price']]

### Recording the Time the Script Finished Scraping

In [13]:
time_ran = dt.datetime.now()

# Begin Comparisons 

### Read in Past 'current' Listings

In [29]:
#Loading the listings from the day before (Dataframe1)
old_listings = pd.read_csv('C:/Users/mikes/Documents/Python Scripts/Canon_Scraper/Data_Tables/CanonCurrentListings.csv')
old_listings

Unnamed: 0,Title,Description,List_Price,List_Date
0,SIGMA 20mm f/1.4 ART for CANON Wide Angle Came...,I bought this lens brand new from BH on March ...,600,6/16/2018
1,"Canon 16-35 F2.8 L II Mint Condition, Flawless",Up for sale is a Canon Canon 16-35 F2.8 L II i...,850,6/16/2018
2,Low Shutter Canon 6D + 50mm 1.8 + Extra Batteries,I have a Canon 6D in great condition with extr...,770,6/16/2018


In [56]:
current_listings = current_listings.iloc[:4]

### Finding Sold Items

In [69]:
# If old listing title is not in current listing, assume the item has sold
common = old_listings.merge(current_listings, on=['Title', 'Description'])
common.rename(columns = {'Price': 'Current Price'}, inplace=True)

In [83]:
merged = old_listings.merge(current_listings, indicator=True, how='outer')
sold_today = merged[merged['_merge'] == 'left_only']

In [85]:
sold_today.drop(['_merge'], axis=1, inplace=True)
sold_today.rename(columns={'Price':'Sell_Price'}, inplace=True)
sold_today['Sell_Date'] = time_ran.date()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [86]:
sold_today

Unnamed: 0,Title,Description,List_Price,List_Date,Sell_Price,Sell_Date
2,Low Shutter Canon 6D + 50mm 1.8 + Extra Batteries,I have a Canon 6D in great condition with extr...,770.0,6/16/2018,,2018-06-16


In [93]:
#Loading the sold items csv
sold_items = pd.read_csv('C:/Users/mikes/Documents/Python Scripts/Canon_Scraper/Data_Tables/SoldItems.csv')

In [96]:
#Adding items sold today to sold_items
sold_items = sold_items.append(sold_today)

Unnamed: 0,Title,Description,List_Price,List_Date,Sell_Price,Sell_Date
0,Low Shutter Canon 6D + 50mm 1.8 + Extra Batteries,I have a Canon 6D in great condition with extr...,770.0,6/16/2018,,2018-06-16
2,Low Shutter Canon 6D + 50mm 1.8 + Extra Batteries,I have a Canon 6D in great condition with extr...,770.0,6/16/2018,,2018-06-16


In [92]:
#Writing to the sold items csv
sold_items.to_csv('C:/Users/mikes/Documents/Python Scripts/Canon_Scraper/Data_Tables/SoldItems.csv', index=False, encoding='utf-8')

In [97]:
current_listings

Unnamed: 0,Title,Description,Price
0,Canon DSLR Lenses,Selling the following lenses all in excellent ...,1000
1,SIGMA 20mm f/1.4 ART for CANON Wide Angle Came...,I bought this lens brand new from BH on March ...,600
2,"Canon 16-35 F2.8 L II Mint Condition, Flawless",Up for sale is a Canon Canon 16-35 F2.8 L II i...,850
3,Canon Prime Lens 135mm 2.0,"Used but in excellent condition , barely used!...",500


### Start Here for work 6/9/18

In [None]:
#Finding rows in yesterdays listings that are also in todays listing
common = old_listings[old_listings.columns.difference(['Price'])].merge(current_listings, on=['Title', 'Description'])
common

In [None]:
#Finding the items not already being tracked
new_items=(current_listings[(~current_listings.Title.isin(common.Title))])

#Adding the list day to new items dataframe
new_items['List_Date'] = time_ran.date()

#Renaming price to be List Price 
new_items.rename(columns = {'Price':'List_Price'}, inplace=True)

In [None]:
new_items

In [None]:
#Writing the listings to csv to be used tomorrow
new_items.to_csv('C:/Users/mikes/Documents/Python Scripts/Canon_Scraper/Data_Tables/CanonCurrentListings.csv', index=False, encoding='utf-8')

## Start of Visuals

In [None]:
#Joining descriptions for a wordcloud
s = ''
for item in dailyitems:
    s += item[2]
# print(s)
# wordcloud = WordCloud().generate(s)
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis("off")
# # lower max_font_size
# wordcloud = WordCloud(max_font_size=40).generate(text)
# plt.figure()
# plt.imshow(wordcloud, interpolation="bilinear")
# plt.axis("off")
# plt.show()

In [None]:
#Making an image in the shape of the camera
# read the mask image
col = Image.open("/Users/mikes/Documents/GetawayDevelopment/camera.jpg")
gray = col.convert('L')
bw = gray.point(lambda x: 0 if x<230 else 255, '1')
bw.save("/Users/mikes/Documents/GetawayDevelopment/camera_mask.jpg")

camera_mask = np.array(Image.open("/Users/mikes/Documents/GetawayDevelopment/camera_mask.jpg"))

wc = WordCloud(background_color="white", max_words=2000, mask=camera_mask)
# generate word cloud
wc.generate(s)

# store to file
wc.to_file("/Users/mikes/Documents/GetawayDevelopment/canon.jpg")

# show
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.figure()

#This plots the grey-scale camera
plt.imshow(camera_mask, cmap='gray', interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
df.Price.std()

In [None]:
#Accessing the price dataframe
#df.Price = df.Price.str.replace("$", "") !PROBABLY SHOULDN"T BE COMMENTED OUT!!!!
df.Price = pd.to_numeric(df.Price)
df

In [None]:
import plotly.plotly as py
from plotly.graph_objs import *

data = [Bar(x=df.Title, y = df.Price)]
py.iplot(data)