# A Python bases script to read in credit card reviews from `cardraitings.com`, collect reward information from those credit cards, and try to find the best combination of cards for an individual's expenses

In [1]:
import os
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
import pickle


In [2]:
# Opening us the website that contains links to reviews for 
# the cedit cards we will be investiating

parent_url = 'https://www.cardratings.com/credit-card-list.html'
res = requests.get(parent_url)
html_page = res.content
soup = BeautifulSoup(html_page, 'html.parser')
cc_urls = []
for link in soup.find_all('a', href=True):
    if 'cardratings.com/credit-card/' in link.get('href'):
        print(link.get('href'))
        cc_urls.append(link.get('href'))


-card-review
https://www.cardratings.com/credit-card/phi-kappa-phi-rewards-visa-credit-card-review
https://www.cardratings.com/credit-card/philadelphia-eagles-nfl-extra-points-from-barclays-bank-delaware
https://www.cardratings.com/credit-card/philadelphia-phillies-bankamericard-cash-rewards-mastercard
https://www.cardratings.com/credit-card/picknsave-metro-market-rewards-world-mastercard
https://www.cardratings.com/credit-card/pier-1-imports-rewards-credit-card-review
https://www.cardratings.com/credit-card/pittsburgh-pirates-bankamericard-cash-rewards-mastercard
https://www.cardratings.com/credit-card/pittsburgh-steelers-nfl-extra-points-from-barclays-bank-delaware
https://www.cardratings.com/credit-card/plains-commerce-bank-mastercard-classic.html
https://www.cardratings.com/credit-card/plains-commerce-bank-visa-classic
https://www.cardratings.com/credit-card/plains-commerce-bank-visa-gold-card
https://www.cardratings.com/credit-card/popular-bank-platinum-mastercard-review
https://w

In [5]:


def get_annual_fee(string):
    """
    A function to find the annual fee (or dollar amount) in a string
    
    Inputs:
    - string (str), a string containing a $dollar amount
    
    Output (float), the dollar amount of the annual fee
    """
    try:
        return float(re.search('\$\d+', string).group()[1:])
    except AttributeError:
        return 0.
    
def get_annual_bonus(string):
    """
    A function to look through a string and try and find the annual 
    bonuses that you get with a particular card that offsets the annual
    fee of those cards
    
    Inputs:
    - string (str), a string that contians a description of the card bonuses
    
    Outputs:
    - credits (float), the dollar amount of the annual bonus
    """
    credits = 0
    for line in string.split('\n'):
        credit_words = ['saving', 'credits'] # we want to see these words in a sentence
        comparitive_words = ['higher', 'lower', 'worse', 'better'] # we don't want to see these words
        if any(map(lambda x: x in line.lower(), credit_words)) and not any(map(lambda x: x in line, comparitive_words)):
            try:
                credits += max([float(credit[1:]) for credit in re.findall('\$\d+', line) if float(credit[1:]) >= 50])
            except ValueError:
                pass
    return credits

def get_rewards_info_from_url(url):
    """
    A function that will request the html data from a url and find
    the rewards, annual fee, and annual bonuses for that link. This
    function isn't 100% working (issues with annual fee and bonus). But
    can parse out the rewards.
    
    Inputs:
    - url (str), a string representation of the url
    
    Outputs:
    - title (str), the title of the post
    - rewards (dict), a dictionary containing the rewards, annual_fee
        annual_bonus, and net_fee
    """
    
    print()
    print()
    res = requests.get(url)
    html_page = res.content
    soup = BeautifulSoup(html_page, 'html.parser')
    title = soup.title.text
    print(title)
    print(url)
    if '400' in title:
        rewards = {
            'flights':0,
            'hotel':0,
            'grocery':0,
            'gas':0,
            'utilities':0,
            'restaurants':0,
            'other':0 
        }
        rewards['annual_fee'] = 0
        rewards['annual_bonus'] = 0
        rewards['net_fee'] = 0
        return None, rewards
        
    d = soup.find('div', itemprop='description')
    try:
        df = pd.read_html(d.decode())[0].T.set_index(0)
    except:
        rewards = {
            'flights':0,
            'hotel':0,
            'grocery':0,
            'gas':0,
            'utilities':0,
            'restaurants':0,
            'other':0 
        }
        rewards['annual_fee'] = 0
        rewards['annual_bonus'] = 0
        rewards['net_fee'] = 0
        return title, rewards
        
        
    if 'Rewards' in df.index:
        rewards = get_rewards_from_string(df[1]['Rewards'])
        try:
            rewards['annual_fee'] = get_annual_fee(df[1]['Annual Fee'])
        except KeyError:
            rewards['annual_fee'] = 0
            for line in d.text.split('\n'):
                if 'annual fee' in line.lower():
                    rewards['annual_fee'] = get_annual_fee(line)
                    break
            
            
        rewards['annual_bonus'] = get_annual_bonus(d.text)
        rewards['net_fee'] = rewards['annual_fee'] - rewards['annual_bonus']
    else:
        rewards = get_rewards_from_string(d.text)
        rewards['annual_fee'] = 0
        rewards['annual_bonus'] = 0
        rewards['net_fee'] = 0
    return title, rewards
    

In [8]:
url

'https://www.cardratings.com/credit-card/aboc-platinum-rewards-credit-card.html'

# The following functions are what we're gonna move over to our module 

In [194]:
# A list of keywords we're looking for 
KEY_WORDS = {
    'flights':['flights', 'airlines','travel', 'air', 'southwest', 'fly'],
    'hotel': ['travel', 'hotel'] ,
    'grocery':['supermarket', 'grocery', 'groceries'],
    'gas':['station', 'gas'],
    'utilities':['telephone', 'shipping', 'internet', 'cabel'],
    'restaurants':['restaurants', 'dining'],
    'other':['select', 'rotating']
}

def make_soup(url):
    """ Make a soup object from a url"""
    return BeautifulSoup(requests.get(url).content, 'lxml')


def find_card_name(soup):
    """Find the name of the credit card"""
    tag = soup.find('span', attrs={'itemprop':'name'})
    if not tag:
        return 'Unknown'
    title_list = []
    for t in tag.text.split():
        if '-' in t.lower() or 'review' in t.lower():
            break
        title_list.append(t)
    return ' '.join(title_list)


def find_req_credit(soup):
    """Get the credit required to apply for this card"""
    for tag in soup.find_all('div', attrs={'class':'credit-score-needed'}):
        child = tag.find('div')
        if child:
            return child.text
    return 'Unknown'


def get_card_type(name):
    """Get the type of credit card it is from it's name"""
    possible_names = ['Visa', 'MasterCard', 'American Express', 'Discover']
    for possible_name in possible_names:
        if possible_name.lower() in name.lower(): 
            return possible_name
    return 'Unknown'


def get_df_from_tag(tag):
    """Helps us find this 'rewards table' that exists in many pages """
    try:
        table = tag.find('table', attrs={'class': 'primaryCatgrid'})
        return pd.read_html(table.decode())[0].T.set_index(0)
    except:
        return None


def get_rewards_from_string(string, category, key_words):
    """Get rewards specific for a category with given keywords """
    if not isinstance(string, str):
        return 0.0

    for sentence in string.replace('U.S.', 'US').split('. '):
        if any(map(lambda x: x in sentence.lower(), key_words)):
            try:
                multiplyer = float(re.search('(\d+(?:\.\d+)?)', sentence).group())
                if multiplyer > 15:
                    continue
                return multiplyer
            except AttributeError:
                continue

    return 0.0

def get_rewards_from_df(df, category, key_words):
    """Use a few of the above functions to find rewards"""
    if 'Rewards' in df.index:
        return get_rewards_from_string(df[1]['Rewards'], category, key_words)
    else:
        return None

def get_rewards_from_soup(soup, category, key_words):
    """Combines functions from above fo get all rewards from a soup object """
    tag = soup.find('div', attrs={'itemprop':'description'})
    df = get_df_from_tag(tag)
    if df is None:
        return 0
    rewards = get_rewards_from_df(df, category, key_words)
    if not rewards:
        return get_rewards_from_string(tag.text, category, key_words)
    return rewards


def fix_flat_cash_back(rewards_dict, categories):
    """Helper function to fix cards that have 1.5 cash back on everything"""
    for category in categories:
        rewards_dict[category] = 1.5
    return rewards_dict

def get_application_link(soup):
    tag = soup.find('div', attrs={'class': 'card-header'})
    link = tag.find('a')
    if link:
        return link.get('href', None)
    return None


def find_rotating(soup):
    tags = soup.find_all('div', attrs={'itemprop':'description'})
    for tag in tags:
        if 'rotating' in tag.text.lower():
            return True
    return False

def get_annual_fee(soup):
    df = get_df_from_tag(soup)
    if df is not None and 'Annual Fee' in df.index:
        try:
            fee = float(df.loc['Annual Fee'][1].replace('$', ''))
        except ValueError:
            return 0.0
    fee = 0
    found_one  = False
    for tag in soup.find_all('div', attrs={'itemprop':'description'}):
        if 'annual fee' in tag.text.lower().replace('no annual fee', ''):
            print(tag.text)
            found_one = True
    if not found_one:
        return 0
    assert False
        


tag = soup.find('div', attrs={'itemprop':'description'})
card_information['rotating'] = False
rewards = {
    'flights':0,
    'hotel':0,
    'grocery':0,
    'gas':0,
    'utilities':0,
    'restaurants':0,
    'other':0 
}
# try looking for a list with 'benefits' in it
try:
    benefits = tag.find('ul').text.split('\n')
    for benefit in benefits:
        print(benefit)

except AttributeError:
    # There's no list 
    pass
        
card_information

{'rotating': False}

In [195]:
int_results = []
import random
random.shuffle(cc_urls)
for url in cc_urls[:100]:
    r = {}
    soup = make_soup(url)
    r['name'] = find_card_name(soup)
    r['card_type'] = get_card_type(r['name'])
    
    r['req_credit'] = find_req_credit(soup)
    all_15 = False
    for category, key_words in KEY_WORDS.items():
        reward = get_rewards_from_soup(soup, category, key_words)
        if reward == 1.5:
            all_15 = True
        r[category] = get_rewards_from_soup(soup, category, key_words)

    if all_15: 
        r = fix_flat_cash_back(r, KEY_WORDS.keys())
    r['rotating'] = find_rotating(soup)
    r['annual_fee'] = get_annual_fee(soup)
    r['review'] = url
    r['app_link'] = get_application_link(soup)
    int_results.append(r)





Bulk shopping enthusiasts rejoice; the Costco Anywhere Visa® Card by Citi is a cash-back credit card designed to reward you for your love of giant jars of pickles and enough paper towels to last a year as well as your everyday needs like a tank of gas. Exclusively for Costco members, Costco Visa® cardholders earn cash rewards at Costco, as you'd expect, but on other purchases as well making it a solid everday card. Citi is a CardRatings advertiser.
COSTCO CITI CARD BENEFITS
Here's how the rewards categories break down with the Costco Anywhere Visa® Card by Citi:


Gas: Earn 4% cash back on gas purchases worldwide, including gas at Costco, for up to the first $7,000 per year in spending (1% after that).

Restaurants and travel: Earn 3% cash back on restaurant purchases and eligible travel purchases worldwide, including airfare, hotels, car rentals, travel agencies, cruise lines and Costco Travel.

Costco: Earn 2% cash back on all other purchases from Costco and Costco.com.

All other 

AssertionError: 

In [200]:
for tag in soup.find_all('div', attrs={'itemprop':'description'}):
    for paragraph in tag.find_all('p'):
        if 'annual fee' in paragraph.text.lower().replace('no annual fee', ''):
            print(paragraph.text)
            print()

Perhaps the greatest drawback is the lack of flexibility when it comes to redeeming your rewards. You'll need to wait until February of each year to redeem your cash rewards from the previous year's purchases and the only way to redeem that cash back is through the aforementioned rewards certificate you'll receive in the mail. Card customers looking for a speedier, more efficient way to redeem cash rewards may want to consider other cash-back cards that allow ongoing rewards redemption or redemption directly into a checking or savings account. And, while the card doesn't technically have an annual fee of its own you do need to be a Costco member to apply for the card, which means that you'll pay the warehouse membership fee annually.

The PenFed Platinum Rewards Visa Signature® Card offers a competitive five points per dollar spent (the equivalent of 5% cash back) on gas purchases as well as three points per dollar spend on supermarket purchases, and one point on all other purchases. I

In [201]:
all_rewards_df = pd.DataFrame(int_results)
all_rewards_df

Unnamed: 0,name,card_type,req_credit,flights,hotel,grocery,gas,utilities,restaurants,other,rotating,annual_fee,review,app_link
0,Flagstar Bank Visa® Platinum Card,Visa,Excellent,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,0.0,https://www.cardratings.com/credit-card/flagst...,
1,University of Montana Visa® Rewards Credit Card,Visa,Good,0.0,0.0,2.0,2.0,0.0,0.0,0.0,False,0.0,https://www.cardratings.com/credit-card/montan...,
2,Chemical Bank Secured Visa,Visa,Limited,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,0.0,https://www.cardratings.com/credit-card/chemic...,
3,Agriculture Federal Credit Union Secured Visa®...,Visa,Limited,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,0.0,https://www.cardratings.com/credit-card/afcu-a...,


In [44]:
if False: #not os.path.exists('rewards.pkl'): 
    # If we've run this before, we're gonna read in a pickle file
    # to save time
    total_rewards = {}
    for cc_url in cc_urls:
        if cc_url == "https://www.cardratings.com/credit-card/connect-classic":
            continue # known to be broken url
        title, rewards = get_rewards_info_from_url(cc_url)
        if title: # cleaning up the title
            title_string = ''
            review = False
            for t in title.split():
                if '-' in t.lower() or 'review' in t.lower():
                    break
                title_string += t + ' '
            total_rewards[title_string.strip()] = rewards

    # Making and saving the dataframe
    rewards_df = pd.DataFrame(total_rewards).T
    f = open('rewards.pkl', 'wb')
    pickle.dump(rewards_df, f)
else:
    print('Reading results from a previous pickle file')
    f = open('rewards.pkl', 'rb')
    rewards_df = pickle.load(f)
    
    
rewards_df

Reading results from a previous pickle file


Unnamed: 0,flights,hotel,grocery,gas,utilities,restaurants,other,annual_fee,annual_bonus,net_fee
ABOC Platinum Rewards Mastercard® Credit Card,5.0,5.0,5.0,1.0,1.0,5.0,5.0,0.0,0.0,0.0
Aer Lingus Visa Signature Card,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
American Airlines AAdvantage MileUp℠ Card,2.0,1.0,2.0,1.0,1.0,1.0,1.0,0.0,500.0,-500.0
AAA® Member Rewards Visa Signature® Card,1.0,1.0,2.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0
AACS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
Zions Bank AmaZing Cash™ for Business,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
Zions Bank AmaZing Rate™ for Business,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zions Bank AmaZing Rate™ Credit Card,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zions Bank AmaZing Rewards® Credit Card,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
rewards_df.loc['ABOC Platinum Rewards Mastercard® Credit Card']

flights         5.0
hotel           5.0
grocery         5.0
gas             1.0
utilities       1.0
restaurants     5.0
other           5.0
annual_fee      0.0
annual_bonus    0.0
net_fee         0.0
Name: ABOC Platinum Rewards Mastercard® Credit Card, dtype: float64

In [7]:
# Currently don't trust the annual fee and bonus section, 
# so we're going to process based on the rewards alone

# TODO: Find accurate net_fee for all cards and use that when 
# calculating rewards
to_plot = rewards_df[rewards_df.columns[:-3]]

# Only looking at credit card with greater than 1% back at everything
to_plot = to_plot[to_plot.sum(axis=1) > len(to_plot.columns)]

# Removing cards that have a weirdly high point value
# These data are either parsed incorrectly or are hotel rewards were
# the point to cent ratio is high (e.g. many points to a cent)
to_plot = to_plot[to_plot.sum(axis=1) <= 2*len(to_plot.columns)] 
to_plot

Unnamed: 0,flights,hotel,grocery,gas,utilities,restaurants,other
American Airlines AAdvantage MileUp℠ Card,2.0,1.0,2.00,1.00,1.0,1.0,1.0
AAA® Member Rewards Visa Signature® Card,1.0,1.0,2.00,2.00,1.0,1.0,1.0
AARP® Visa® Signature Card,1.0,1.0,3.00,3.00,1.0,1.0,1.0
Agriculture Federal Credit Union Visa® Platinum (with Rebates),1.0,1.0,2.25,2.25,1.0,1.0,1.0
AmTrust Bank American Express® Silver Card,1.0,1.0,2.00,2.00,1.0,1.0,1.0
...,...,...,...,...,...,...,...
Webster Bank Visa® Business Rewards PLUS Card,1.5,1.5,1.50,1.50,1.5,1.5,1.5
Wells Fargo Business Elite Card,1.5,1.5,1.50,1.50,1.5,1.5,1.5
Wells Fargo Business Platinum Credit Card,1.5,1.5,1.50,1.50,1.5,1.5,1.5
Wells Fargo Business Secured Credit Card,1.5,1.5,1.50,1.50,1.5,1.5,1.5


In [9]:
# From my own annual expenses over the last 12 months

# Reading in a CSV file from my mint.com accout
transactions = pd.read_csv('transactions-3.csv')
transactions['Date'][0]
def compare_date(date):
    """
    A function to find if a date is within a year from now
    
    Inputs:
    - date (str), a date in the mm/dd/yyyy format
    
    Returns:
    - bool (str), a bool indicating if the date is 
        within a year from now or not
    """
    other = '4/3/2020'
    m,d,y = date.split('/')
    om,od,oy = other.split('/')
    m = int(m)
    d = int(d)
    y = int(y)
    om = int(om)
    od = int(od)
    oy = int(oy)
    if oy <= y+1:
        if om < m:
            return True
        elif om == m:
            if od < d:
                return True
            else:
                return False
        else: 
            return False
    else:
        return False
# Removing all positive transactions
transactions = transactions[transactions['Transaction Type'] == 'debit']

# Finding all transactions within a year of now
transactions = transactions[transactions['Date'].map(compare_date)]

# Parsing out annual expenses for each specific category
food = transactions[transactions.Category.map(lambda x: x.lower() in ['food & dinging', 'alcohol & bars', 'restaurants', 'fast food'])].Amount.sum()
flights = transactions[transactions.Category.map(lambda x: x.lower() in ['air travel'])].Amount.sum()
hotel = transactions[transactions.Category.map(lambda x: x.lower() in ['hotel'])].Amount.sum()
gas = transactions[transactions.Category.map(lambda x: x.lower() in ['auto & transport', 'gas & fuel'])].Amount.sum()
grocery = transactions[transactions.Category.map(lambda x: x.lower() in ['groceries'])].Amount.sum()
utilities = transactions[transactions.Category.map(lambda x: x.lower() in ['bills & utilities', 'mobile phone', 'internet'])].Amount.sum()
other = transactions.Amount.sum() - food -  flights - hotel - gas - grocery

# Making a dict containing all of the expenses, to be used later
expenses = {
    'flights': flights,
    'hotel':hotel,
    'grocery':grocery,
    'gas':gas,
    'utilities':utilities,
    'restaurants': food,
    'other':other
    
}

expenses

{'flights': 895.4899999999999,
 'hotel': 9.9,
 'grocery': 1803.92,
 'gas': 336.03999999999996,
 'utilities': 0.0,
 'restaurants': 4069.8500000000004,
 'other': 44067.240000000005}

In [10]:
def calculate_rewards(rewards_df, expenses):
    """
    A function to calculate the max rewards for a parsed dataframe.
    TODO: this needs to eventually include net_fee in it's equation.
    
    Inputs:
    - rewards_df (pd.DateFrame), a sliced dataframe with columns
        similar to to_plot
    - expenses (dict), a dictionary containing all of a person's 
        annual expenses
        
    Returns:
    - cash_rewards (float), the annual rewards one would recieve
    """
    cash_rewards = 0
    for category, expense in expenses.items():
        try:
            if isinstance(rewards_df[category], float):
                rate = rewards_df[category] / 100
            else:
                rate = max(rewards_df[category]) / 100
            cash_rewards += rate * expense
        except:
            continue
        
    return cash_rewards

In [11]:
# Given that there are many different combinations of cards, we're 
# going to be solving this stochasticly. 1000 random combination of
# `card_number` of cards will be chosen and will be used to estimate 
# annual rewards. These will be stored to find an approximate max, median
# and min amount of rewards for each number_of_cards.

if os.path.exists('calculated_rewards.pkl') and os.path.exists('cloud.pkl'):
    f = open('calculated_rewards.pkl', 'rb')
    calculated_rewards = pickle.load(f)
    f = open('cloud.pkl', 'rb')
    word_cloud = pickle.load(f)
else:
    calculated_rewards = {} # to keep track of total rewards
    word_cloud = {} # to be used for a wordcloud
    for card_number in range(11):
        # Between 0 and 10 cards
        card_rewards = []
        cloud_list = []

        for i in range(1000):
            choice = np.random.choice(to_plot.index, card_number)
            card_rewards.append(calculate_rewards(to_plot.loc[choice], expenses))
            cloud_list.append((calculate_rewards(to_plot.loc[choice], expenses), choice))
        calculated_rewards[card_number] = card_rewards
        word_cloud[card_number] = cloud_list

    f = open('calculated_rewards.pkl', 'wb')
    pickle.dump(calculated_rewards, f)
    f = open('cloud.pkl', 'wb')
    pickle.dump(word_cloud, f)    