In [1]:
from bs4 import BeautifulSoup
from collections import OrderedDict
import time
import urllib.request as request
import re

def getMakes():
    """Return the vehicle makes listed on the carcomplaints.com home page.

    Every ``<li>`` inside a ``<section>`` whose ``id`` matches ``makes`` is
    visited, and the ``href`` of its link is used as the make name after the
    forward slashes are removed.

    Returns:
        list[str]: Make names in the order they appear on the page.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    url = 'http://www.carcomplaints.com/'

    # Parse inside the ``with`` block so the HTTP response is closed as soon
    # as the document has been read.
    with request.urlopen(url) as html:
        soup = BeautifulSoup(html, 'lxml')

    sections = soup.find_all('section', id=re.compile('makes'))

    make_list = []
    for section in sections:
        for li in section.find_all('li'):
            make_list.append(li.a['href'].replace('/', ''))

    return make_list


def getYearCounts(make, model):
    """Return the number of complaints per model year for a make and model.

    Args:
        make: Make name as used in the site's URLs.
        model: Model name as used in the site's URLs.

    Returns:
        dict[int, int]: Model year mapped to its complaint count.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    url = 'http://www.carcomplaints.com/'+make+'/'+model+'/'

    # Parse inside the ``with`` block so the HTTP response is closed as soon
    # as the document has been read.
    with request.urlopen(url) as html:
        soup = BeautifulSoup(html, 'lxml')

    # NOTE(review): 'bar*' means "ba" followed by any number of "r", so it
    # matches any id containing "ba"; kept as-is to preserve the selection.
    bars = soup.find_all('li', id=re.compile('bar*'))

    year_counts_dict = {}
    for item in bars:
        year = int(item.find('span', class_='label').get_text())
        # Counts may contain thousands separators, e.g. "1,234".
        count = int(item.find('span', class_='count').get_text().replace(",", ""))
        year_counts_dict[year] = count

    return year_counts_dict


def getCountsByModel(make):
    """Return the number of complaints for each model of a vehicle make.

    Args:
        make: Make name as used in the site's URLs, e.g. 'Honda', 'Acura',
            'Ford', 'GMC'.

    Returns:
        OrderedDict[str, int]: Model name (spaces replaced by underscores)
        mapped to its complaint count, in page order.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    url = 'http://www.carcomplaints.com/'
    url_make = url+make+'/'

    # Parse inside the ``with`` block so the HTTP response is closed as soon
    # as the document has been read.
    with request.urlopen(url_make) as html_make:
        soup = BeautifulSoup(html_make, 'lxml')

    # The data is divided up in an arbitrary number of columns per page.
    # NOTE(review): 'c*' can match the empty string, so this regex accepts
    # any id value; kept as-is to preserve the selection.
    columns = soup.find_all('ul', class_='column bar', id=re.compile('c*'))

    make_model_counts_dict = OrderedDict()
    for column in columns:
        for row in column.find_all('li'):
            model = row.a.get_text().replace(' ', '_')
            # Counts may contain thousands separators, e.g. "1,234".
            make_model_counts_dict[model] = int(row.span.get_text().replace(",", ""))

    return make_model_counts_dict


def getTopSystemsQty(make, model, year):
    """Return the complaint count per vehicle system for one model year.

    Args:
        make: Make name as used in the site's URLs.
        model: Model name as used in the site's URLs.
        year: Model year; converted with ``str()`` when building the URL.

    Returns:
        OrderedDict[str, int]: System name (the item's link ``href`` without
        its last character) mapped to its complaint count, in page order.
        Items that lack a link or a numeric count are skipped.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    url = 'http://www.carcomplaints.com/'+make+'/'+model+'/'+str(year)+'/'

    # Parse inside the ``with`` block so the HTTP response is closed as soon
    # as the document has been read.
    with request.urlopen(url) as html:
        soup = BeautifulSoup(html, 'lxml')

    # NOTE(review): 'bar*' means "ba" followed by any number of "r", so it
    # matches any id containing "ba"; kept as-is to preserve the selection.
    bars = soup.find_all('li', id=re.compile('bar*'))

    problem_counts_dict = OrderedDict()  # We want to maintain insertion order
    for item in bars:
        try:
            system = item.a['href'][:-1]
            qty = int(item.span.get_text().replace(",", ""))
        except (AttributeError, KeyError, TypeError, ValueError):
            # Best effort: skip items with no <a>/<span>, no href, or a
            # non-numeric count, without hiding unrelated errors such as
            # KeyboardInterrupt.
            continue
        problem_counts_dict[system] = qty

    return problem_counts_dict


def getNhtsaSystemsQty(make, model, year):
    """Return the number of NHTSA complaints per vehicle system for one model year.

    Args:
        make: Make name as used in the site's URLs.
        model: Model name as used in the site's URLs.
        year: Model year; converted with ``str()`` when building the URL.

    Returns:
        OrderedDict[str, int]: System name (the item's link ``href`` without
        its trailing slash) mapped to its NHTSA complaint count, in page order.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    url = 'http://www.carcomplaints.com/'+make+'/'+model+'/'+str(year)+'/'

    # Parse inside the ``with`` block so the HTTP response is closed as soon
    # as the document has been read.
    with request.urlopen(url) as html:
        soup = BeautifulSoup(html, 'lxml')

    nhtsa_counts = []
    for item in soup.find_all('em', class_='nhtsa'):
        tokens = item.span.get_text().split()
        try:
            # Usually there are 3 whitespace-separated tokens; the 3rd is the qty.
            qty = int(tokens[2])
        except (IndexError, ValueError):
            # Some entries only have 2 tokens; the qty is then the 2nd one.
            qty = int(tokens[1])
        nhtsa_counts.append(qty)

    # NOTE(review): 'bar*' means "ba" followed by any number of "r", so it
    # matches any id containing "ba"; kept as-is to preserve the selection.
    systems = soup.find_all('li', id=re.compile('bar*'))

    # Remove the ending forward slash from each link.
    systems_list = [item.a['href'][:-1] for item in systems]

    # Systems and counts are paired by position; zip() stops at the shorter list.
    return OrderedDict(zip(systems_list, nhtsa_counts))


def getSubSystemsQty(make, model, year, system):
    """Return the number of complaints per sub-system of a vehicle system.

    Args:
        make: Make name as used in the site's URLs.
        model: Model name as used in the site's URLs.
        year: Model year; converted with ``str()`` when building the URL.
        system: System name as used in the site's URLs, e.g. 'transmission'.

    Returns:
        OrderedDict[str, int]: The item's link ``href`` up to its first '.'
        mapped to its complaint count, in page order.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    url = 'http://www.carcomplaints.com/'+make+'/'+model+'/'+str(year)+'/'+system+'/'

    # Parse inside the ``with`` block so the HTTP response is closed as soon
    # as the document has been read.
    with request.urlopen(url) as html:
        soup = BeautifulSoup(html, 'lxml')

    # NOTE(review): 'bar*' means "ba" followed by any number of "r", so it
    # matches any id containing "ba"; kept as-is to preserve the selection.
    bars = soup.find_all('li', id=re.compile('bar*'))

    subsystem_counts_dict = OrderedDict()  # We want to maintain insertion order
    for item in bars:
        # Drop everything from the first '.' on (the file extension).
        subsystem = item.a['href'].split(".")[0]
        # Counts may contain thousands separators, e.g. "1,234".
        subsystem_counts_dict[subsystem] = int(item.span.get_text().replace(",", ""))

    return subsystem_counts_dict


def _fetchReviewPage(url):
    """Fetch one review page; return its parsed soup and its review paragraphs."""
    # Parse inside the ``with`` block so the HTTP response is closed as soon
    # as the document has been read.
    with request.urlopen(url) as html:
        soup = BeautifulSoup(html, 'lxml')

    paragraphs = []
    for review in soup.find_all('div', 'comments'):
        for ps in review.find_all('p'):
            text = ps.getText()
            # Skip the advertisement placeholder paragraphs.
            if text != 'A D V E R T I S E M E N T S':
                paragraphs.append(text)

    return soup, paragraphs


def getReviews(make, model, year, system, subsystem):
    """Return the text of the customer reviews for one sub-system problem.

    The first page is always read. Its description meta tag is printed as a
    summary and searched for "Page 1 of N"; when N is greater than 1 the
    remaining pages are fetched as well, printing a progress line for each.

    Args:
        make: Make name as used in the site's URLs.
        model: Model name as used in the site's URLs.
        year: Model year; converted with ``str()`` when building the URL.
        system: System name as used in the site's URLs.
        subsystem: Sub-system page name, without the '.shtml' extension.

    Returns:
        list[str]: Text of every paragraph found inside the 'comments' divs,
        excluding advertisement placeholders, in page order.

    Raises:
        urllib.error.URLError: If a page cannot be fetched.
    """
    base_url = 'http://www.carcomplaints.com/'+make+'/'+model+'/'+str(year)+'/'+system+'/'+subsystem
    soup, complaints = _fetchReviewPage(base_url+'.shtml')

    #####  Read the first page, now check if there are 2 or more pages  #####
    # Get the description so we can then figure out if there are multiple pages
    description = soup.find('meta', attrs={'name': 'description'})['content']
    print("Summary: ", description, '\n\n')

    # Treat a description without a "Page 1 of N" marker as a single page
    # instead of failing on the missing marker.
    page_marker = re.search(r'Page 1 of\s*(\d+)', description)
    num_pages = int(page_marker.group(1)) if page_marker else 1

    if num_pages > 1:
        for page in range(2, num_pages+1):
            print('Parsing page ' + str(page) + ' of ' + str(num_pages))
            _, more_complaints = _fetchReviewPage(base_url+'-'+str(page)+'.shtml')
            complaints.extend(more_complaints)
    else:
        print("Only one page to parse")

    return complaints

In [2]:
# List every make scraped from the site's home page.
getMakes()

['Acura',
 'Audi',
 'BMW',
 'Buick',
 'Cadillac',
 'Chevrolet',
 'Chrysler',
 'Dodge',
 'Ford',
 'GMC',
 'Honda',
 'Hyundai',
 'Infiniti',
 'Isuzu',
 'Jeep',
 'Kia',
 'Lexus',
 'Lincoln',
 'Mazda',
 'Mercedes-Benz',
 'Mercury',
 'Mini',
 'Mitsubishi',
 'Nissan',
 'Oldsmobile',
 'Plymouth',
 'Pontiac',
 'Porsche',
 'Ram',
 'Saab',
 'Saturn',
 'Scion',
 'Subaru',
 'Toyota',
 'Volvo',
 'Volkswagen',
 'Alfa_Romeo',
 'AMC',
 'Bentley',
 'Chery',
 'Daewoo',
 'Datsun',
 'Daihatsu',
 'Eagle',
 'Ferrari',
 'Fiat',
 'Geo',
 'Holden',
 'HSV',
 'Hummer',
 'Jaguar',
 'Kenworth',
 'Lamborghini',
 'Land_Rover',
 'Lotus',
 'Mahindra',
 'Maruti',
 'Maserati',
 'Opel',
 'Peugeot',
 'Renault',
 'Rover',
 'Seat',
 'Skoda',
 'Smart',
 'Ssangyong',
 'Suzuki',
 'Tata',
 'Tesla',
 'Vauxhall',
 'Yugo',
 'Zimmer']

In [3]:
# Complaint totals for each Acura model.
getCountsByModel('Acura')

OrderedDict([('CL', 25),
             ('CSX', 5),
             ('EL', 11),
             ('ILX', 14),
             ('ILX_Hybrid', 0),
             ('Integra', 27),
             ('Legend', 27),
             ('MDX', 261),
             ('NSX', 0),
             ('RDX', 74),
             ('RL', 22),
             ('RLX', 19),
             ('RSX', 20),
             ('SLX', 1),
             ('TL', 308),
             ('TLX', 178),
             ('TSX', 126),
             ('Vigor', 2),
             ('ZDX', 5)])

In [4]:
# Complaint totals for the Acura TLX, keyed by model year.
getYearCounts('Acura', 'TLX')

{2018: 0, 2017: 0, 2016: 12, 2015: 166}

In [5]:
# Complaint totals per vehicle system for the 2015 Acura TLX.
getTopSystemsQty('Acura', 'TLX', 2015)

OrderedDict([('transmission', 119),
             ('accessories-interior', 12),
             ('brakes', 9),
             ('electrical', 7),
             ('AC_heater', 6),
             ('engine', 6),
             ('body_paint', 2),
             ('accessories-exterior', 1),
             ('exhaust_system', 1),
             ('steering', 1),
             ('suspension', 1),
             ('wheels_hubs', 1)])

In [6]:
# Complaint totals per transmission sub-system; keys are the link paths
# with the file extension removed.
getSubSystemsQty('Acura', 'TLX', 2015, 'transmission')

OrderedDict([('/Acura/TLX/2015/transmission/jerks_between_gears', 50),
             ('/Acura/TLX/2015/transmission/transmission_jerkiness_in_stop_and_go_traffic',
              21),
             ('/Acura/TLX/2015/transmission/transmission_jerks_and_hesitates_when_cold',
              12),
             ('/Acura/TLX/2015/transmission/hesitates_from_a_stop', 7),
             ('/Acura/TLX/2015/transmission/rpms_fluctuate_hard_shifts', 6),
             ('/Acura/TLX/2015/transmission/surges_forward_parking_slips', 6),
             ('/Acura/TLX/2015/transmission/when_gas_pedal_pressed_car_doesnt_move',
              5),
             ('/Acura/TLX/2015/transmission/not_shifting_properly', 4),
             ('/Acura/TLX/2015/transmission/transmission_failure', 4),
             ('/Acura/TLX/2015/transmission/transmission_grinds', 3),
             ('/Acura/TLX/2015/transmission/slipped_into_drive', 1)])

In [7]:
# Collect the review paragraphs for this sub-system; getReviews prints a
# summary line and per-page progress while it runs.
reviews = getReviews('Acura', 'TLX', 2015, 'transmission', 'jerks_between_gears')

Summary:  The 2015 Acura TLX has 50 problems reported for jerks between gears. Average failure mileage is 4,950 miles. (Page 1 of 3) 


Parsing page 2 of 3
Parsing page 3 of 3


In [8]:
# Print each collected review paragraph.
for review in reviews:
    print(review)

When traveling under 20 mph, the vehicle downshifts dramatically causing a slight jerking motion.

				- Albert M.,
				Colton, US

Just got my 2015 TLX in December. After a month of driving I noticed (mostly when it's cold) it slams into gear when accelerating and slams when slowing down. It felt like someone rear ended me the first few times. I saw many other people had the same issue and was wondering if I should get it looked at. Not sure who to go to since it seems Acura isn't doing anything about it. I loved my 2005 TL but transmission left me on the side of the highway (@135000) don't want the same thing happening with this car. Besides this problem I love the car.

				- Samantha V.,
				West Roxbury, US

The main problem is jerkiness between 2-3 and 3-4 gears. Also, the throttle response is really bad. You have to practically floor the gas pedal to get a real response.

				- Tony F.,
				Frederick, MD, US

Having the same issues as everyone here is reporting with jerkiness an