## Project Description : Car Price Prediciton 

## Import Libraries

In [1]:
## for loading and preprocessing 
import pandas as pd
import numpy as np 

## for data visualization 
import matplotlib.pyplot as plt
import seaborn as sns

## splitting dataset
from sklearn.model_selection import train_test_split

## load regression model 
from sklearn.linear_model import LinearRegression

## Loading Data & Creating Copy

In [2]:
## read dataset
data = pd.read_csv("datasets/car_price_prediction.csv")

In [3]:
## create a deep copy 
df = data.copy()

## Data Preview And Understanding
- Loading the dataset

In [4]:
## view the first rows 
df.head()

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
0,45654403,13328,1399,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,186005 km,6.0,Automatic,4x4,04-May,Left wheel,Silver,12
1,44731507,16621,1018,CHEVROLET,Equinox,2011,Jeep,No,Petrol,3.0,192000 km,6.0,Tiptronic,4x4,04-May,Left wheel,Black,8
2,45774419,8467,-,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,200000 km,4.0,Variator,Front,04-May,Right-hand drive,Black,2
3,45769185,3607,862,FORD,Escape,2011,Jeep,Yes,Hybrid,2.5,168966 km,4.0,Automatic,4x4,04-May,Left wheel,White,0
4,45809263,11726,446,HONDA,FIT,2014,Hatchback,Yes,Petrol,1.3,91901 km,4.0,Automatic,Front,04-May,Left wheel,Silver,4


In [5]:
## view the last five rows 
df.tail()

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
19232,45798355,8467,-,MERCEDES-BENZ,CLK 200,1999,Coupe,Yes,CNG,2.0 Turbo,300000 km,4.0,Manual,Rear,02-Mar,Left wheel,Silver,5
19233,45778856,15681,831,HYUNDAI,Sonata,2011,Sedan,Yes,Petrol,2.4,161600 km,4.0,Tiptronic,Front,04-May,Left wheel,Red,8
19234,45804997,26108,836,HYUNDAI,Tucson,2010,Jeep,Yes,Diesel,2,116365 km,4.0,Automatic,Front,04-May,Left wheel,Grey,4
19235,45793526,5331,1288,CHEVROLET,Captiva,2007,Jeep,Yes,Diesel,2,51258 km,4.0,Automatic,Front,04-May,Left wheel,Black,4
19236,45813273,470,753,HYUNDAI,Sonata,2012,Sedan,Yes,Hybrid,2.4,186923 km,4.0,Automatic,Front,04-May,Left wheel,White,12


In [6]:
## check the number of rows and columns 
print(f'Total number of rows: {df.shape[0]} -> Total number of columns: {df.shape[1]}')

Total number of rows: 19237 -> Total number of columns: 18


In [7]:
## get a summary description of the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19237 entries, 0 to 19236
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                19237 non-null  int64  
 1   Price             19237 non-null  int64  
 2   Levy              19237 non-null  object 
 3   Manufacturer      19237 non-null  object 
 4   Model             19237 non-null  object 
 5   Prod. year        19237 non-null  int64  
 6   Category          19237 non-null  object 
 7   Leather interior  19237 non-null  object 
 8   Fuel type         19237 non-null  object 
 9   Engine volume     19237 non-null  object 
 10  Mileage           19237 non-null  object 
 11  Cylinders         19237 non-null  float64
 12  Gear box type     19237 non-null  object 
 13  Drive wheels      19237 non-null  object 
 14  Doors             19237 non-null  object 
 15  Wheel             19237 non-null  object 
 16  Color             19237 non-null  object

In [8]:
## checking for missing values
df.isnull().sum()

ID                  0
Price               0
Levy                0
Manufacturer        0
Model               0
Prod. year          0
Category            0
Leather interior    0
Fuel type           0
Engine volume       0
Mileage             0
Cylinders           0
Gear box type       0
Drive wheels        0
Doors               0
Wheel               0
Color               0
Airbags             0
dtype: int64

In [9]:
## checking for duplicated values 
df.duplicated().sum()

313

In [10]:
## checking type on columns
df.dtypes

ID                    int64
Price                 int64
Levy                 object
Manufacturer         object
Model                object
Prod. year            int64
Category             object
Leather interior     object
Fuel type            object
Engine volume        object
Mileage              object
Cylinders           float64
Gear box type        object
Drive wheels         object
Doors                object
Wheel                object
Color                object
Airbags               int64
dtype: object

In [11]:
## finding unique instances in each column 
print(np.unique(df['Price']))


[       1        3        6 ...   627220   872946 26307500]


In [12]:
## finding unique instances in each column 
print(np.unique(df['Model']))

['09-Mar' '100' '100 NX' ... 'macan S' 'tC' 'xD']


In [13]:
print(np.unique(df['Mileage']))

['0 km' '1000 km' '10000 km' ... '99999 km' '999999 km' '999999999 km']


## Data preprocessing 
- replacing unsual characters with NaN values
- Drop the Id column
- Normalizing column names
- Remove km from Mileage column
- Filling of missing 

In [14]:
df['Levy'] = df['Levy'].replace('-', np.nan)

In [15]:
df = df.drop(['ID'], axis=1)

In [16]:
##change column names to lower case and replace spaces with underscore
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [17]:
df.columns

Index(['price', 'levy', 'manufacturer', 'model', 'prod._year', 'category',
       'leather_interior', 'fuel_type', 'engine_volume', 'mileage',
       'cylinders', 'gear_box_type', 'drive_wheels', 'doors', 'wheel', 'color',
       'airbags'],
      dtype='object')

In [18]:
## remove km from the mileage column
df['Mileage'] = df['Mileage'].str.replace(' km', '')
df['Mileage'] = df['Mileage'].str.replace('-km', '')

KeyError: 'Mileage'

In [None]:
## convert the mileage column type to float
df['Mileage'] = df['Mileage'].astype('int64')

In [None]:
df['Levy'].isnull().sum()

In [None]:
## converting column type to float
df['Levy'] = df['Levy'].astype('float')

In [None]:
df['Levy'] = df['Levy'].fillna(df['Levy'].mean())

In [None]:
df['Levy'].isnull().sum()

## Exploratory Data Analysis
- statistical summary
- Target variable analysis

In [None]:
df.describe().round()

In [None]:

sns.histplot(df['Price'][df['Price'] < 100000])

plt.title('Histogram Distribution of Price')
plt.xlabel('Price')
plt.ylabel('Frequency')

plt.show()

In [None]:
## log transformation 
log_price = np.log1p(df['Price'])

In [None]:
sns.histplot(log_price)

plt.title('Histogram Distribution of the Log Price')
plt.xlabel('Price')
plt.ylabel('Frequency')

plt.show()


## Building A Validation Framework / Splitting Dataset
- Training set 60 %
- Validation dataset 20%
- Test dataset 20%


In [None]:
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=11)

df_train, df_valid = train_test_split(df_train_full, test_size=0.25, random_state=11) 

print(f'Train set {len(df_train)}')
print(f'Valid set {len(df_valid)}')
print(f'Test set {len(df_test)}')


In [None]:
df_train.head()

In [None]:
## selecting target column and converting it to a vector format
y_train = df_train['Price'].values
y_valid = df_valid['Price'].values
y_test = df_test['Price'].values

In [None]:
#y_train

In [None]:
del df_train['Price']
del df_valid['Price']
del df_test['Price']

## Train A Baseline Regression Model
- Selecting baseline features
- Train a regression model

In [None]:
baseline_features = ['Levy', 'Prod. year', 'Cylinders', 'Airbags']

df_train_bl = df_train[baseline_features]

In [None]:
X_train_bl = df_train_bl.values
X_train_bl

In [None]:
## lets train model 
model = LinearRegression() 
model.fit(X_train_bl, y_train)


## Model Evaluation
- Generate the validation predictions
- Compare the validation predictions to the actual predictions
- Return an accuracy score
  