# Canon DSLR Scraper

### Importing Libraries

In [19]:
import csv
import datetime as dt
import time
import warnings
from collections import OrderedDict
from datetime import date
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from more_itertools import unique_everseen
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from wordcloud import WordCloud
warnings.filterwarnings('ignore')

### Starting the WebDriver

In [20]:
# Launch a Chrome session, open the Craigslist page at the URL below and
# locate the search box (the element whose id is "query").
# find_element(By.ID, ...) is used instead of find_element_by_id because the
# find_element_by_* helpers are no longer provided by current Selenium
# releases, while this form is accepted by older releases as well.
driver = webdriver.Chrome()
driver.get("https://washingtondc.craigslist.org/nva")
search = driver.find_element(By.ID, "query")

In [21]:
# Type the search keyword into the query box, then press Enter to submit it.
for keystrokes in ("Canon", Keys.ENTER):
    search.send_keys(keystrokes)

In [22]:
# Narrow the search by category: click the "select all" checkbox, then the
# checkboxes with ids cat_pha and cat_ela, then the button that applies the
# selection. The clicks happen in exactly this order.
# NOTE(review): the original comment describes these as the photo/video and
# electronics categories - confirm the ids still map to those.
# find_element(By..., ...) replaces the find_element_by_* helpers, which
# current Selenium releases no longer provide.
for category_selector in (
    'input.catcheck.selectallcb',
    ".catcheck.multi_checkbox[id='cat_pha']",
    ".catcheck.multi_checkbox[id='cat_ela']",
    "button.searchlink.linklike.changed_input.clickme",
):
    driver.find_element(By.CSS_SELECTOR, category_selector).click()

# Bundle duplicate postings and require a picture.
# NOTE(review): the original comment also mentions not including nearby
# areas, but no option for that is clicked here - confirm whether one is needed.
for option_name in ("bundleDuplicates", "hasPic"):
    driver.find_element(By.NAME, option_name).click()

In [23]:
# Enter the minimum and maximum price, then submit the filter with Enter.
# find_element(By.NAME, ...) replaces find_element_by_name, which current
# Selenium releases no longer provide.
minP = driver.find_element(By.NAME, "min_price")
minP.send_keys(500)
# One-second pause before the second field is filled in.
# NOTE(review): presumably gives the page time to settle - confirm it is needed.
time.sleep(1)
maxP = driver.find_element(By.NAME, "max_price")
maxP.send_keys(1500)
maxP.send_keys(Keys.ENTER)

### Gathering Eligible Links

In [24]:
# Snapshot the rendered results page and parse it with the built-in HTML parser.
html = driver.page_source
soup = BeautifulSoup(markup=html, features="html.parser")

In [25]:
def _first_hrefs(nodes):
    """Return the href of the first <a> inside each node.

    Nodes that contain no <a>, or whose first <a> has no href, are skipped so
    that no None value ends up in the returned list.
    """
    hrefs = []
    for node in nodes:
        anchor = node.find('a')
        if anchor is None:
            continue
        href = anchor.get('href')
        if href:
            hrefs.append(href)
    return hrefs


# Links taken from every result row on the page.
nonDups = soup.find_all('li', {'class': 'result-row'})
nduplinks = _first_hrefs(nonDups)

# Links taken from the lists of bundled duplicate postings.
dups = soup.find_all('ul', {'class': 'duplicate-rows'})
duplinks = _first_hrefs(dups)

In [26]:
# Keep only the result links that do not also appear among the bundled
# duplicates. A set makes each membership test constant-time.
bundled_links = set(duplinks)
links = [href for href in nduplinks if href not in bundled_links]

# Collapse repeated links while keeping first-seen order (per the original
# note, a listing's picture and title can both point at the same URL).
links = list(unique_everseen(links))

# Drop bare "#" placeholder links in a single pass.
links = [href for href in links if href != "#"]

# FOR TEST PURPOSES

In [27]:
# Test-only cap: keep just the first few links so a trial run stays short.
# NOTE(review): while this cell is executed, every later step only sees these
# links - confirm it is meant to stay enabled for real runs.
TEST_LINK_LIMIT = 5
links = links[:TEST_LINK_LIMIT]

# -------------------------------------------

In [28]:
# Visit every listing link, scrape its title, price and description, and
# return to the results page before moving on to the next link.
#   itemDict   - list of {"Title", "Price", "Description"} dicts
#   dailyitems - the same data as [title, price, description] lists
#   count      - number of listings scraped successfully
count = 0
dailyitems = []
itemDict = []
for link in links:
    # Open the listing by clicking its anchor through JavaScript.
    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which
    # current Selenium releases no longer provide.
    element = driver.find_element(By.XPATH, '//a[@href="%s"]' % link)
    driver.execute_script('arguments[0].click();', element)

    # Parse the listing page.
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    price_tag = soup.find('span', {'class': 'price'})
    title_tag = soup.find('span', {'id': 'titletextonly'})
    body_tag = soup.find('section', {'id': 'postingbody'})

    # soup.find returns None for a missing element; skip such a page rather
    # than letting an AttributeError abort the whole scrape.
    if price_tag is None or title_tag is None or body_tag is None:
        driver.back()
        continue

    price = price_tag.get_text()
    title = title_tag.get_text()
    # str.replace removes every occurrence in one call, so no loop is needed.
    text = body_tag.get_text()
    text = text.replace("\n", "")
    text = text.replace("QR Code Link to This Post", "")
    itemD = {"Title": title, "Price": price, "Description": text}

    # Record the listing in both output structures.
    itemDict.append(itemD)
    dailyitems.append([itemD['Title'], itemD['Price'], itemD['Description']])
    count += 1

    # Go back to the results page so the next link can be located.
    driver.back()

In [29]:
# Scraping is finished: end the browser session.
# quit() shuts down the whole WebDriver session (browser and driver process),
# whereas close() only closes the current window.
driver.quit()
#pprint(dailyitems)

# Build the listings table. Naming the columns explicitly keeps Title, Price
# and Description present even when nothing was scraped.
df = pd.DataFrame(itemDict, columns=["Title", "Price", "Description"])
current_listings = df
#To generate the first day's csv
#current_listings.to_csv('CanonCurrentListings.csv', index=False, encoding='utf-8')

In [30]:
# Convert the scraped price text (e.g. "$1,200") into numbers.
# The conversion is skipped when the column is already numeric, which makes
# the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(current_listings['Price']):
    # The regex flag is given explicitly: its default differs between pandas
    # versions, and a bare '$' is an end-of-string anchor when read as a
    # regular expression. The character class strips both the dollar sign and
    # thousands separators.
    price_text = current_listings['Price'].str.replace(r'[$,]', '', regex=True)
    current_listings['Price'] = pd.to_numeric(price_text.str.strip())

# Fix the column order used from here on.
current_listings = current_listings[['Title', 'Description', 'Price']]

### Recording the Time the Script Finished Scraping

In [31]:
# Naive local timestamp taken once scraping is done; its date is what later
# cells stamp onto new and sold listings.
time_ran = dt.datetime.now(tz=None)

# Begin Comparisons 

### Read in Past 'current' Listings

In [116]:
# Load the listings table written by the previous run.
_PREVIOUS_LISTINGS_CSV = 'C:/Users/mikes/Documents/Python Scripts/Canon_Scraper/Data_Tables/CanonCurrentListings.csv'
old_listings = pd.read_csv(_PREVIOUS_LISTINGS_CSV)

### Combine Common and New Listings


In [None]:
# Listings found both in the previous run and in today's scrape, matched on
# title and description (inner join).
match_keys = ['Title', 'Description']
common = pd.merge(old_listings, current_listings, on=match_keys)

# Today's scraped price becomes the tracked current price; the raw Price
# column is removed in the same step.
common['Current_Price'] = common.pop('Price')

In [None]:
# Find today's listings that are not already tracked.
# A listing counts as tracked only when BOTH its title and its description
# match a row of `common`, the same keys `common` itself was built with.
# Matching on title alone would silently drop a new listing that happens to
# share a title with a tracked one.
tracked_keys = common[['Title', 'Description']].drop_duplicates()
flagged = current_listings.merge(
    tracked_keys, on=['Title', 'Description'], how='left', indicator=True
)
# drop() returns a new frame, so the edits below never touch current_listings.
new_items = flagged[flagged['_merge'] == 'left_only'].drop('_merge', axis=1)

# The scraped price is the price the item was first listed at.
new_items = new_items.rename(columns={'Price': 'List_Price'})

# Stamp the new items with the date of this run.
new_items['List_Date'] = time_ran.date()

# A new item's current price starts out equal to its list price.
new_items['Current_Price'] = new_items['List_Price']

In [None]:
# Today's tracked listings = listings carried over from the previous run plus
# the newly found ones. pd.concat is used because DataFrame.append is no
# longer available in current pandas releases; sort=False keeps the column
# order of `common` followed by any extra columns of `new_items`.
today_listings = pd.concat([common, new_items], sort=False)

# Overwrite the listings file so the next run can use it as its baseline.
today_listings.to_csv('C:/Users/mikes/Documents/Python Scripts/Canon_Scraper/Data_Tables/CanonCurrentListings.csv', index=False, encoding='utf-8')

### Addressing Sold Items

In [109]:
# Outer-join the previous listings with today's on title and description.
# The `_merge` indicator column records which side each row came from.
sold_keys = ['Title', 'Description']
merged = pd.merge(
    old_listings,
    current_listings,
    how='outer',
    left_on=sold_keys,
    right_on=sold_keys,
    indicator=True,
)

# Rows present only in the previous listings are treated as sold today.
only_in_old = merged['_merge'] == 'left_only'
sold_today = merged[only_in_old]

In [110]:
# Append today's sold listings to the running sold-items file.
# Nothing is read or written when no listing disappeared.
if not sold_today.empty:
    # drop()/rename() return new frames, so the filtered slice of `merged` is
    # never modified in place.
    # NOTE(review): 'Price' appears to come only from today's scrape, so for
    # rows found only in the previous listings Sell_Price is presumably NaN -
    # confirm whether Current_Price was the intended source.
    sold_today = sold_today.drop(['_merge'], axis=1)
    sold_today = sold_today.rename(columns={'Price': 'Sell_Price'})
    sold_today['Sell_Date'] = time_ran.date()

    # Load the sold items recorded by earlier runs.
    sold_items = pd.read_csv('C:/Users/mikes/Documents/Python Scripts/Canon_Scraper/Data_Tables/SoldItems.csv')

    # pd.concat is used because DataFrame.append is no longer available in
    # current pandas releases; sort=False preserves the existing column order.
    sold_items = pd.concat([sold_items, sold_today], sort=False)

    # Write the combined table back to the sold-items file.
    sold_items.to_csv('C:/Users/mikes/Documents/Python Scripts/Canon_Scraper/Data_Tables/SoldItems.csv', index=False, encoding='utf-8')