In [92]:
#load libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [93]:
#load the jiji scrape dataset
df = pd.read_csv("data/jiji_car_scrap.csv")
df.head()

Unnamed: 0,title,make,model,year,condition,transmission,location,price
0,Land Rover Lr4 V8 2010 Black,Land,Rover,,local used,automatic,Isolo,"₦ 6,500,000"
1,New Toyota Hilux 2024 White,New,Toyota,,new,automatic,Ikeja,"₦ 103,000,000"
2,Toyota Sienna 2012 White,Toyota,Sienna,2012.0,local used,automatic,Ibadan,"₦ 12,700,000"
3,Lexus RX 2015 Black,Lexus,RX,2015.0,foreign used,automatic,Central Business District,"₦ 30,000,000"
4,Mercedes-Benz C350 C350 BlueEFFICIENCY RWD 200...,,,,local used,automatic,Kaduna / Kaduna State,"₦ 9,200,000"


In [94]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1979 entries, 0 to 1978
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         1979 non-null   object 
 1   make          1626 non-null   object 
 2   model         1626 non-null   object 
 3   year          702 non-null    float64
 4   condition     1979 non-null   object 
 5   transmission  1971 non-null   object 
 6   location      1979 non-null   object 
 7   price         1979 non-null   object 
dtypes: float64(1), object(7)
memory usage: 123.8+ KB


In [95]:
#check isnull df value
df.isnull().sum()

title              0
make             353
model            353
year            1277
condition          0
transmission       8
location           0
price              0
dtype: int64

In [96]:
#let's clean the price column
df['price'] = df['price'].astype(str).str.replace(r"[₦,]", "", regex=True).astype(float)
df.head(20)

Unnamed: 0,title,make,model,year,condition,transmission,location,price
0,Land Rover Lr4 V8 2010 Black,Land,Rover,,local used,automatic,Isolo,6500000.0
1,New Toyota Hilux 2024 White,New,Toyota,,new,automatic,Ikeja,103000000.0
2,Toyota Sienna 2012 White,Toyota,Sienna,2012.0,local used,automatic,Ibadan,12700000.0
3,Lexus RX 2015 Black,Lexus,RX,2015.0,foreign used,automatic,Central Business District,30000000.0
4,Mercedes-Benz C350 C350 BlueEFFICIENCY RWD 200...,,,,local used,automatic,Kaduna / Kaduna State,9200000.0
5,Toyota Venza 2013 Gray,Toyota,Venza,2013.0,foreign used,automatic,Ojodu,24500000.0
6,Lexus RX 350 2019 Orange,Lexus,RX,,foreign used,automatic,Alimosho,55000000.0
7,Toyota Hilux 2020 White,Toyota,Hilux,2020.0,local used,automatic,Central Business District,47000000.0
8,Honda Accord 2007 Red,Honda,Accord,2007.0,local used,automatic,Gwarinpa,4000000.0
9,Toyota Corolla LE 2006 Red,Toyota,Corolla,,foreign used,automatic,Ibadan,9500000.0


In [97]:
#use pandas fillna to fill the nan column value
df['make'] = df['make'].fillna(df['title'].str.extract(r"(^\w+)")[0])
df['model'] = df['model'].fillna(df['title'].str.extract(r"^(?P<model>[A-Za-z0-9\-]+)")['model'])
df['year'] = df['year'].fillna(df['title'].str.extract(r"(\d{4})")[0]).astype(float)
# df['model'].head()
df['model'].head(20)

0             Rover
1            Toyota
2            Sienna
3                RX
4     Mercedes-Benz
5             Venza
6                RX
7             Hilux
8            Accord
9           Corolla
10              206
11    Mercedes-Benz
12    Mercedes-Benz
13            Camry
14           Matrix
15          Charger
16           Avalon
17    Mercedes-Benz
18    Mercedes-Benz
19       Highlander
Name: model, dtype: object

In [98]:
#drop subset row
df.dropna(subset=['year', 'transmission'], inplace=True)
df.isnull().sum()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1969 entries, 0 to 1978
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         1969 non-null   object 
 1   make          1969 non-null   object 
 2   model         1969 non-null   object 
 3   year          1969 non-null   float64
 4   condition     1969 non-null   object 
 5   transmission  1969 non-null   object 
 6   location      1969 non-null   object 
 7   price         1969 non-null   float64
dtypes: float64(2), object(6)
memory usage: 138.4+ KB


In [101]:
df['location'] = df['location'].str.replace(r"[\s\-/()]+", "", regex=True)
df['location'].unique()

array(['Isolo', 'Ikeja', 'Ibadan', 'CentralBusinessDistrict',
       'KadunaKadunaState', 'Ojodu', 'Alimosho', 'Gwarinpa', 'Jos',
       'AmuwoOdofin', 'Ajah', 'Surulere', 'IkotunIgando', 'Apapa', 'Ojo',
       'Yaba', 'Lekki', 'Katampe', 'Mabushi', 'Uyo', 'Awka', 'Ikorodu',
       'Shomolu', 'Warri', 'OshimiliSouth', 'Gbagada', 'Jahi', 'Magodo',
       'Lokogoma', 'Agege', 'Ogba', 'Asokoro', 'Kubwa', 'Ogudu', 'Owerri',
       'Nyanya', 'LugbeDistrict', 'Kaura', 'AbuleEgba', 'Wuye', 'Garki2',
       'Galadimawa', 'Osogbo', 'Enugu', 'PortHarcourt', 'Onitsha',
       'BeninCity', 'Durumi', 'AdoOdoOta', 'KanoMunicipal', 'ApoDistrict',
       'Gudu', 'Ikoyi', 'Ojota', 'Kado', 'Maryland', 'AbeokutaNorth',
       'EgbeIdimu', 'Utako', 'IlorinEast', 'AbeokutaSouth', 'Kosofe',
       'Ilupeju', 'AdoEkiti', 'AbujaFCT', 'Ipaja', 'IkaSouth', 'Ejigbo',
       'Zaria', 'Wuse2', 'VictoriaIsland', 'Mushin', 'Akure',
       'IlorinSouth', 'Gaduwa', 'Gwagwalada', 'Ikwerre', 'Garki1', 'Jabi',
       'Sa

In [91]:
df.head()

Unnamed: 0,title,make,model,year,condition,transmission,location,price
0,Land Rover Lr4 V8 2010 Black,Land,Rover,2010.0,local used,automatic,Isolo,6500000.0
1,New Toyota Hilux 2024 White,New,Toyota,2024.0,new,automatic,Ikeja,103000000.0
2,Toyota Sienna 2012 White,Toyota,Sienna,2012.0,local used,automatic,Ibadan,12700000.0
3,Lexus RX 2015 Black,Lexus,RX,2015.0,foreign used,automatic,CentralBusinessDistrict,30000000.0
4,Mercedes-Benz C350 C350 BlueEFFICIENCY RWD 200...,Mercedes,Mercedes-Benz,2008.0,local used,automatic,KadunaKadunaState,9200000.0
