<a href="https://colab.research.google.com/github/mkm-world/Car-Prices-in-Nigeria/blob/main/Web_Scraping_cars_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Web scraping cars
This notebook contains the scripts for extracting car listing links and the details of each car from cars45.com.

## Setting up Selenium for use in Google Colab

In [1]:
%%capture
!pip install selenium
!apt-get update # to update ubuntu to correctly run apt install
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver/usr/bin
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
from selenium import webdriver
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
wd = webdriver.Chrome('chromedriver',chrome_options=chrome_options)

## Importing the required libraries

In [5]:
from time import time
from tqdm import tqdm
import requests
import pandas as pd
from time import sleep
import numpy as np
from selenium.webdriver.common.by import By
import os
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')

## Defining the scraping functions

In [6]:
def get_car_links(page):
  """Return the href of every 'car-feature' element on a parsed listing page.

  Args:
    page: object exposing ``find_all(class_=...)``; the callers in this
      notebook pass a BeautifulSoup document.

  Returns:
    list of href values in document order. Matched elements that carry no
    href are skipped instead of raising KeyError, so one malformed element
    does not abort the whole crawl.
  """
  links = []
  for anchor in page.find_all(class_='car-feature'):
    href = anchor.get('href')
    if href:
      links.append(href)
  return links

def get_car_info(car_link):
  """Scrape one car detail page into a flat dict.

  The page is loaded with the notebook-level Selenium driver ``wd`` and parsed
  with BeautifulSoup.

  Args:
    car_link: site-relative link of the car (appended to
      'https://www.cars45.com'); everything after its first character is
      stored as 'car_id'.

  Returns:
    dict with 'car_id', 'price' (raw text of the price heading), one entry per
    overview item (keyed by the icon's alt text, value is the stripped text)
    and one entry per general-info row (keyed by the row's <span> text, value
    is the <p> text).

  Raises:
    AttributeError: if the price element is not present on the page.
  """
  wd.get('https://www.cars45.com'+car_link)
  car_page = BeautifulSoup(wd.page_source, 'html.parser')

  car_info = {
    'car_id': car_link[1:],
    # Deliberately unguarded: a page without a price is treated as a failed scrape.
    'price': car_page.find(class_='main-details__name').find('h5').text,
  }

  # Overview items: each <div> pairs an icon (its alt text names the property)
  # with the value text. Rows without an icon/alt are skipped rather than
  # failing the whole car.
  overview = car_page.find(class_='svg flex')
  if overview is not None:
    for item in overview.find_all('div'):
      icon = item.find('img')
      label = icon.get('alt') if icon is not None else None
      if label is not None:
        car_info[label] = item.text.strip()

  # General info rows: <span> holds the property name, <p> the value.
  gen_info = car_page.find(class_='general-info grid')
  if gen_info is not None:
    for row in gen_info.find_all('div'):
      name_tag, value_tag = row.find('span'), row.find('p')
      if name_tag is not None and value_tag is not None:
        car_info[name_tag.text] = value_tag.text
  return car_info

In [7]:
def try_href(page):
  """Tell whether the last 'js-handle-click-ctr' element on ``page`` has an href.

  Args:
    page: object exposing ``find_all(class_=...)`` (a BeautifulSoup document
      elsewhere in this notebook).

  Returns:
    True (and prints ``True``) when the lookup succeeds; False when there is
    no such element, it has no href, or the lookup fails in any other way.
  """
  try:
    page.find_all(class_='js-handle-click-ctr')[-1]['href']
  except Exception:
    # Not a bare except: KeyboardInterrupt/SystemExit must still propagate.
    return False
  print(True)
  return True

In [9]:
stop_page = 50   # total number of listing pages to scrape, counting the first page
def extract_all_links(pages=None):
  """Collect the car links from the first listing pages of cars45.com.

  Uses the notebook-level Selenium driver ``wd`` to load each page and
  ``get_car_links`` to pull the links out of it.

  Args:
    pages: total number of listing pages to visit, counting the first one.
      Defaults to the notebook-level ``stop_page``.

  Returns:
    list of car links from every page visited. If a later page fails, the
    crawl stops there and the links gathered so far are returned.
  """
  last_page = stop_page if pages is None else pages

  # The first page has no page suffix in its URL.
  wd.get('https://www.cars45.com/listing')
  links = get_car_links(BeautifulSoup(wd.page_source, 'html.parser'))

  # Pages 2..last_page inclusive, so that `last_page` pages are scraped in total.
  for page_number in tqdm(range(2, last_page + 1)):
    try:
      wd.get('https://www.cars45.com/listing/page'+str(page_number))
      links += get_car_links(BeautifulSoup(wd.page_source, 'html.parser'))
    except Exception as exc:
      # Best effort: keep what has been collected, but say why the crawl stopped.
      print('Sope Otilor: page {} failed ({!r})'.format(page_number, exc))
      break
  return links

## Extracting all car links and their info

In [10]:
# Visit the listing pages and gather the link of every car found on them.
all_links=extract_all_links()

100%|██████████| 18/18 [00:31<00:00,  1.77s/it]


In [11]:
# Scrape every car page. Each success is stored as a one-row DataFrame indexed
# by the link's position in `all_links`; the frames are concatenated in the
# cleaning cell.
cars = []
for num, link in enumerate(tqdm(all_links)):
  try:
    cars.append(pd.DataFrame(get_car_info(link), index=[num]))
  except Exception as exc:
    # Best effort: report the failing link and its cause, then carry on.
    # (Not a bare except, so interrupting the kernel still stops the loop.)
    print('error with link '+link+' ('+repr(exc)+')')

100%|██████████| 285/285 [07:18<00:00,  1.54s/it]


## Converting to a DataFrame and cleaning the data


In [12]:
# Stack the one-row frames; columns missing for a given car become NaN.
df = pd.concat(cars)

# Price arrives as display text; keep only its digits and convert to int.
df['price'] = df['price'].apply(lambda text: int(''.join(ch for ch in text if ch.isdigit())))

# Columns that should be numeric. Values that are not plain numbers (such as
# '5 or 7' for Seats) become NaN instead of crashing the conversion.
num_cols = ['Seats']
for col in num_cols:
  if col in df.columns:
    df[col] = pd.to_numeric(df[col], errors='coerce').astype(float)

# Save to Google Drive when it is mounted; otherwise fall back to the working
# directory so that a finished scrape is not lost to a missing folder.
output_path = '/content/drive/MyDrive/car_prices.csv'
if not os.path.isdir(os.path.dirname(output_path)):
  print('Google Drive is not mounted at /content/drive; saving to the working directory instead.')
  output_path = os.path.basename(output_path)
df.to_csv(output_path, index=False)
df

Unnamed: 0,car_id,price,car,fuel type,gear type,Make,Model,Year of manufacture,Trim,Colour,Condition,Mileage,Drivetrain,Seats,Number of Cylinders,Engine Size,Horse Power,Selling Condition,Bought Condition,Registered city
0,7v4AXif7Gee1raPtpvK0q88V,6300000,Sedan,Petrol,Automatic,Mercedes-Benz,C-Class,2009,C 300 4MATIC (W204),Black,Foreign Used,52133,All Wheel,5.0,6,3000,231,Imported,Imported,
1,8hYGbcnRyLDbbU8KqjAhIjCe,5250000,Station Wagon,Petrol,Automatic,Scion,xB,2011,Base,Blue,Foreign Used,76692,Front Wheel,5.0,4,2400,158,Imported,Imported,
2,8T2nHeYZFe4Kt48VubKCnhaU,8640000,SUV,Petrol,Automatic,Mercedes-Benz,M Class,2010,,White,Nigerian Used,223912,,,,3500,,Registered,Registered,
3,wIWR1YeUxPYTxXK0gBhJEdRb,5775000,,Petrol,CVT,Toyota,Corolla,2014,,Black,Nigerian Used,131885,,,,1800,,Registered,Imported,LAGOS
4,jGxvOBy0An6aJ3KlLGkNflie,9360000,SUV,Petrol,Automatic,Toyota,Venza,2010,,Brown,Foreign Used,212735,,,,3500,,Imported,Imported,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280,r4s3A2Oa0m4yTYk9XVyDOuF2,3150000,,Petrol,Automatic,Hyundai,Sonata,2013,,Silver,Nigerian Used,133844,,,,2400,,Registered,Imported,Lagos
281,kfxA2aU7FrCq5YwrE2eJGfUb,3150000,SUV,Petrol,Automatic,Honda,Pilot,2008,EX 4x4 (3.5L 6cyl 5A),Silver,Nigerian Used,153609,All Wheel,8.0,6,3500,247,Registered,Imported,LAGOS
282,iddmNgwhS4P6QXCb5EOtkBoH,3465000,Sedan,Petrol,Automatic,Toyota,Camry,2007,LE 4dr Sedan (2.4L 4cyl 5A),Gray,Foreign Used,151648,Front,5.0,4,2400,158,Imported,Imported,
283,m7GCqdNJwSIHO5eYvNm5kZxv,3328000,,Petrol,Automatic,Toyota,Camry,2009,,Gray,Nigerian Used,156522,,,,2400,,Registered,Registered,
