In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re   

%matplotlib inline

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Filesystem path of the ChromeDriver executable handed to Selenium.
loc = '/usr/local/bin/chromedriver'

# Chrome launch flags: run without a visible window and pass --log-level=3.
options = Options()
for chrome_flag in ('--headless', '--log-level=3'):
    options.add_argument(chrome_flag)

# Shared browser session used by every scraping cell below.
# NOTE(review): no driver.quit() is visible in this section - confirm the
# session is closed somewhere once scraping is finished.
driver = webdriver.Chrome(loc, options=options)

### Scraping titles and links of all valid bonus posts

In [3]:
def get_title_from_post_title(post_title):
    """Return a cleaned bonus title, or None when the post should be skipped.

    A post is skipped when its title mentions "targeted", "business" or
    "miles", or when it contains none of "$", "checking" or "saving"
    (all matched case-insensitively).

    For a kept post the following are removed from the title, in order:
      1. every "[...]" tag that is followed by a space, e.g. "[VA only] ";
      2. every remaining "[...]" or "(...)" group;
      3. every en dash followed by one whitespace character and any run of
         ASCII letters/whitespace after it.
    Whitespace left at either end by those removals is stripped.

    Parameters
    ----------
    post_title : str
        Raw title text of a post.

    Returns
    -------
    str or None
        The cleaned title, or None if the post is filtered out.
    """
    excluded = re.search(r'targeted|business|miles', post_title, re.I)
    required = re.search(r'\$|checking|saving', post_title, re.I)

    if excluded or not required:
        return None

    # The 4th positional parameter of re.sub is `count`, not `flags`; passing
    # re.I there capped each pattern at 2 replacements. None of these patterns
    # is case-sensitive, so no flag is needed at all.
    cleanup_patterns = (
        r'\[.*?\] ',
        r'[\[\(].*?[\]\)]',
        r'–\s[a-zA-Z\s]*',
    )
    for pattern in cleanup_patterns:
        post_title = re.sub(pattern, '', post_title)

    return post_title.strip()

In [4]:
def scrape_bonus_posts(page_i, page_f):
    """Collect [title, link] pairs from the bank-account-bonuses listing pages.

    Loads listing pages ``page_i`` through ``page_f`` (both inclusive) with the
    notebook-level ``driver``, reads every link inside an
    ``h2.omc-blog-one-heading`` element, and keeps the posts whose title passes
    ``get_title_from_post_title``.

    Parameters
    ----------
    page_i : int
        First listing page number to load.
    page_f : int
        Last listing page number to load (inclusive).

    Returns
    -------
    list of [str, str]
        One ``[cleaned_title, href]`` entry per kept post, in page order.
    """
    # Function-scope import so this cell does not depend on an import added
    # elsewhere in the notebook.
    from selenium.webdriver.common.by import By

    main_url = 'https://www.doctorofcredit.com/category/bank-account-bonuses/'  # page 7-54 for year 2019
    heading_links_xpath = "//h2[@class='omc-blog-one-heading']//a"
    bonus_posts = []

    for page in range(page_i, page_f + 1):
        driver.get(main_url + 'page/' + str(page) + '/')  # e.g. .../page/7/

        # find_elements(By.XPATH, ...) is the supported spelling; the
        # find_elements_by_xpath helper is deprecated in Selenium 4 and
        # removed in later 4.x releases.
        for post_anchor in driver.find_elements(By.XPATH, heading_links_xpath):
            post_title = get_title_from_post_title(post_anchor.text)
            if post_title:
                bonus_posts.append([post_title, post_anchor.get_attribute('href')])

    return bonus_posts

In [5]:
# Scrape listing pages 1 through 30 (inclusive) into a list of [title, link] pairs.
bonus_posts = scrape_bonus_posts(1, 30)

In [6]:
# Number of posts that survived the title filter.
len(bonus_posts)

213

In [7]:
# Tabulate the scraped [title, link] pairs and save them to CSV.
bonus_posts_df = pd.DataFrame(bonus_posts, columns=['title', 'post_link'])
# NOTE(review): to_csv is called with its defaults, so the row index is written
# as an extra unnamed first column. The "Unnamed: 0" column in the later head()
# output looks like that index being read back - confirm whether index=False
# is wanted here before changing the file format.
bonus_posts_df.to_csv('../data/bonus_posts_page_1_30.csv')

### Scraping post details of each bonus

In [8]:
# Imported in this cell so it runs without relying on imports added elsewhere.
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By


def _first_text(xpath, default=np.nan):
    """Return the text of the first element matching ``xpath`` on the page
    currently loaded in ``driver``, or ``default`` when nothing matches or
    Selenium raises while looking it up."""
    try:
        matches = driver.find_elements(By.XPATH, xpath)
        return matches[0].text if matches else default
    except WebDriverException:
        # Best-effort scraping: a Selenium failure on one field should not
        # abort the whole run. Unlike a bare `except:`, this still lets
        # KeyboardInterrupt and genuine programming errors propagate.
        return default


# j-th <p> following an element whose text is exactly "Avoiding Fees".
FEE_XPATH = '//*[text()="Avoiding Fees"]/following::p[{}]'

# Visit every post and add three text columns: 'glance' (first <ul> of the
# article), 'offer' (second <ul>) and 'fee' (the first five fee paragraphs,
# concatenated with no separator). Iterating over the index labels keeps the
# .loc writes correct even if the frame does not have a default RangeIndex.
for i, post_link in bonus_posts_df['post_link'].items():
    driver.get(post_link)

    bonus_posts_df.loc[i, 'glance'] = _first_text('//*[@id="omc-full-article"]/ul[1]')
    bonus_posts_df.loc[i, 'offer'] = _first_text('//*[@id="omc-full-article"]/ul[2]')
    bonus_posts_df.loc[i, 'fee'] = ''.join(
        str(_first_text(FEE_XPATH.format(j), default='')) for j in range(1, 6)
    )

In [9]:
# Preview the first rows, including the newly added glance/offer/fee columns.
bonus_posts_df.head()

Unnamed: 0,title,post_link,glance,offer,fee
0,Chesapeake Bank $100 Savings Bonus,https://www.doctorofcredit.com/va-only-chesape...,Maximum bonus amount: $100\nAvailability: VA o...,Chesapeake Bank is offering a bonus of $100 wh...,The totally free checking account has no month...
1,Arsenal Credit Union $100 Checking Bonus,https://www.doctorofcredit.com/mo-only-arsenal...,Maximum bonus amount: $100\nAvailability: MO o...,Arsenal Credit Union is offering a bonus of $1...,This account has no monthly fees to worry abou...
2,The Cooperative Bank $200 Checking Bonus,https://www.doctorofcredit.com/ma-only-the-coo...,Maximum bonus amount: $200\nAvailability: MA o...,The Cooperative Bank is offering a bonus of $2...,"This account has a $10 monthly fee, this is wa..."
3,Old Second Bank $200 Checking Bonus,https://www.doctorofcredit.com/il-only-old-sec...,Maximum bonus amount: $200\nAvailability:Must ...,Old Second Bank is offering a $200 checking bo...,Money connection account has a $3.95 fee if yo...
4,BBVA $400 Checking Bonus + $50 Savings Bonus &...,https://www.doctorofcredit.com/bbva-400-checki...,Maximum bonus amount: $450\nAvailability: Nati...,BBVA Compass is offering a checking bonus of $...,"If you live in AL, AZ, CA, CO, FL NM or TX the..."


In [11]:
# Row count of the table after the detail columns were added.
len(bonus_posts_df)

213

In [12]:
# Save the table with the scraped detail columns.
# NOTE(review): default to_csv arguments also write the row index as an unnamed
# first column - confirm downstream readers expect that.
bonus_posts_df.to_csv('../data/bonus_post.csv')