## Importing the relevant libraries

In [1]:
# For this practical example we will need the following libraries and modules
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import seaborn as sns
sns.set()

## Loading the raw data

In [2]:
# Read the car-sales records from the CSV file that sits next to this notebook
raw_data = pd.read_csv(filepath_or_buffer='CarSale.csv')
print(raw_data.shape)

# Preview the first five rows of the DataFrame
raw_data.head(n=5)

(4345, 9)


Unnamed: 0,Brand,Price,Body,Mileage,EngineV,Engine Type,Registration,Year,Model
0,BMW,4200.0,sedan,277,2.0,Petrol,yes,1991,320
1,Mercedes-Benz,7900.0,van,427,2.9,Diesel,yes,1999,Sprinter 212
2,Mercedes-Benz,13300.0,sedan,358,5.0,Gas,yes,2003,S 500
3,Audi,23000.0,crossover,240,4.2,Petrol,yes,2007,Q7
4,Toyota,18300.0,crossover,120,2.0,Petrol,yes,2011,Rav 4


## Common functions

In [3]:
# targets = data_preprocessed['log_price']
# inputs = data_preprocessed.drop(['log_price'], axis=1)

def LR_Model(inputs, targets, test_size=0.2, random_state=365):
    """Fit a random-forest regressor on the given features and print a report.

    Despite its name, this helper trains a ``RandomForestRegressor``, not a
    linear regression; the name is kept so existing calls keep working.

    The rows are split into train and test sets first, and the scaler is then
    fitted on the training rows only, so no statistics of the held-out rows
    leak into the preprocessing. The report consists of the R^2 score on both
    splits followed by every feature's importance (rounded to two decimals,
    largest first).

    Parameters
    ----------
    inputs : pandas.DataFrame
        Feature columns, one row per sample.
    targets : array-like
        Target values, matched to ``inputs`` by position.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 365
        Seed used for both the split and the forest, so that repeated runs
        print the same numbers.

    Returns
    -------
    None
        Everything is printed; nothing is returned.
    """
    # Kept local so the helper does not depend on the notebook's import cell.
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    # Split BEFORE scaling: the scaler must only ever see training rows.
    X_train, X_test, y_train, y_test = train_test_split(
        inputs, targets, test_size=test_size, random_state=random_state
    )

    scaler = StandardScaler().fit(X_train)
    # Rebuild DataFrames so the column labels survive for the importance report.
    X_train = pd.DataFrame(
        scaler.transform(X_train), columns=inputs.columns, index=X_train.index
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test), columns=inputs.columns, index=X_test.index
    )

    # Seeded forest: without a seed the printed scores differ on every run.
    model = RandomForestRegressor(random_state=random_state)
    model.fit(X_train, y_train)
    print('Train set: ', model.score(X_train, y_train))
    print('Test set: ',model.score(X_test, y_test))

    # Pair each column with its rounded importance, most important first.
    # Sorting on the rounded value keeps ties in original column order.
    ranked = sorted(
        (
            (feature, round(importance, 2))
            for feature, importance in zip(X_train.columns, model.feature_importances_)
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for pair in ranked:
        print('{:50} Importance: {}'.format(*pair))
    
# LR_Model(inputs, targets)

In [4]:
def Predicting_Model(num_df, cat_df, targets, test_size=0.2, random_state=365):
    """Fit a random-forest regressor on numeric + encoded categorical features.

    The numeric columns are standardised and the categorical (already encoded)
    columns are passed through unchanged. The three inputs are matched row by
    row **by position**; their index labels are ignored, so frames that carry
    a non-default index (for example after filtering without
    ``reset_index``) are combined correctly instead of being mis-aligned by
    ``pd.concat``.

    The rows are split into train and test sets first, and the scaler is then
    fitted on the training rows only. The function prints the R^2 score on
    both splits followed by every feature's importance (rounded to two
    decimals, largest first).

    Parameters
    ----------
    num_df : pandas.DataFrame
        Numeric feature columns to be standardised.
    cat_df : pandas.DataFrame
        Encoded categorical columns, same number of rows as ``num_df``.
    targets : array-like
        Target values, same length as ``num_df``.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 365
        Seed used for both the split and the forest, so that repeated runs
        print the same numbers.

    Returns
    -------
    None
        Everything is printed; nothing is returned.

    Raises
    ------
    ValueError
        If ``num_df``, ``cat_df`` and ``targets`` do not have the same length.
    """
    # Kept local so the helper does not depend on the notebook's import cell.
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    n_rows = len(num_df)
    if len(cat_df) != n_rows or len(targets) != n_rows:
        raise ValueError(
            'num_df, cat_df and targets must have the same number of rows '
            '(got {}, {} and {})'.format(n_rows, len(cat_df), len(targets))
        )

    # Positional alignment: drop the index labels of both frames so the later
    # column-wise concat can never pair rows by mismatching labels.
    numeric = num_df.reset_index(drop=True)
    categorical = cat_df.reset_index(drop=True)

    # One call splits all three inputs with the same row partition.
    num_train, num_test, cat_train, cat_test, y_train, y_test = train_test_split(
        numeric, categorical, targets, test_size=test_size, random_state=random_state
    )

    # Fit on training rows only so test-set statistics do not leak in.
    scaler = StandardScaler().fit(num_train)

    def _assemble(num_part, cat_part):
        # Scaled numeric columns first, then the categorical ones.
        scaled = pd.DataFrame(
            scaler.transform(num_part), columns=num_part.columns, index=num_part.index
        )
        return pd.concat([scaled, cat_part], axis=1)

    X_train = _assemble(num_train, cat_train)
    X_test = _assemble(num_test, cat_test)

    # Seeded forest: without a seed the printed scores differ on every run.
    model = RandomForestRegressor(random_state=random_state)
    model.fit(X_train, y_train)
    print('Train set: ', model.score(X_train, y_train))
    print('Test set: ',model.score(X_test, y_test))

    # Pair each column with its rounded importance, most important first.
    # Sorting on the rounded value keeps ties in original column order.
    ranked = sorted(
        (
            (feature, round(importance, 2))
            for feature, importance in zip(X_train.columns, model.feature_importances_)
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for pair in ranked:
        print('{:50} Importance: {}'.format(*pair))

In [5]:
# Work on an independent copy so that raw_data itself is never modified
data = raw_data.copy(deep=True)

# Dealing with missing values

### List of variables with missing values 

In [6]:
# data.isna() yields a boolean frame: True where a value is missing, False otherwise.
# Summing those booleans column by column therefore gives the number of
# missing values for each feature.
data.isna().sum()

Brand             0
Price           172
Body              0
Mileage           0
EngineV         150
Engine Type       0
Registration      0
Year              0
Model             0
dtype: int64

### Approach 1: Removing rows with missing values

In [7]:
# Keep a snapshot of the frame as it was before any rows are removed
bf_removing_missing = data.copy()
# Drop every row containing at least one missing value, then renumber the index from 0.
# Rule of thumb: dropping is acceptable when it removes less than 5% of the data.
# NOTE(review): the saved outputs show 4345 rows before and 4025 after, i.e. roughly 7%
# of the rows are removed here, which is above that threshold; confirm this is acceptable.
data = data.dropna(axis=0).reset_index(drop=True)
data.describe(include='all')

Unnamed: 0,Brand,Price,Body,Mileage,EngineV,Engine Type,Registration,Year,Model
count,4025,4025.0,4025,4025.0,4025.0,4025,4025,4025.0,4025
unique,7,,6,,,4,2,,306
top,Volkswagen,,sedan,,,Diesel,yes,,E-Class
freq,880,,1534,,,1861,3654,,188
mean,,19552.308065,,163.572174,2.764586,,,2006.379627,
std,,25815.734988,,103.394703,4.935941,,,6.695595,
min,,600.0,,0.0,0.6,,,1969.0,
25%,,6999.0,,90.0,1.8,,,2003.0,
50%,,11500.0,,158.0,2.2,,,2007.0,
75%,,21900.0,,230.0,3.0,,,2012.0,


# Data Exploration

In [8]:
# Name of the column to predict
# NOTE(review): not referenced by the other cells visible in this notebook
target = 'Price'
# Numeric feature columns used as model inputs
num_cols = [
    'Mileage',
    'EngineV',
    'Year',
]

# Preprocessing

#### Price

In [9]:
# The 99th percentile of 'Price' is the cut-off for outliers
q = data['Price'].quantile(0.99)
# Keep only the rows priced strictly below that cut-off (this removes roughly
# the most expensive 1% of the listings) and renumber the index from 0
data_1 = data.loc[data['Price'].lt(q)].reset_index(drop=True)
data_1.describe(include='all')

Unnamed: 0,Brand,Price,Body,Mileage,EngineV,Engine Type,Registration,Year,Model
count,3984,3984.0,3984,3984.0,3984.0,3984,3984,3984.0,3984
unique,7,,6,,,4,2,,302
top,Volkswagen,,sedan,,,Diesel,yes,,E-Class
freq,880,,1528,,,1853,3613,,188
mean,,17837.11746,,165.116466,2.74377,,,2006.292922,
std,,18976.268315,,102.766126,4.956057,,,6.672745,
min,,600.0,,0.0,0.6,,,1969.0,
25%,,6980.0,,93.0,1.8,,,2002.75,
50%,,11400.0,,160.0,2.2,,,2007.0,
75%,,21000.0,,230.0,3.0,,,2011.0,


#### Mileage

In [10]:
# Same treatment for 'Mileage': keep rows strictly below its 99th percentile
q = data_1['Mileage'].quantile(0.99)
data_2 = data_1.loc[data_1['Mileage'].lt(q)].reset_index(drop=True)

#### EngineV

In [11]:
# Keep only the rows whose engine volume is strictly below 6.5
# NOTE(review): 6.5 is a hard-coded cut-off; presumably the largest plausible
# engine volume. Confirm and consider naming it as a constant
data_3 = data_2.loc[data_2['EngineV'].lt(6.5)].reset_index(drop=True)

#### Year

In [12]:
# For 'Year' the outliers sit at the low end, so they are removed from below:
# keep only the rows whose year is strictly above the 1st percentile
q = data_3['Year'].quantile(0.01)
data_4 = data_3.loc[data_3['Year'].gt(q)].reset_index(drop=True)

In [17]:
# Natural log of the price, used as the regression target
log_price = np.log(data_4['Price'])
# assign() returns a new frame, so data_4 is left untouched; the new column goes last
data_cleaned = data_4.assign(log_price=log_price)
targets = data_cleaned['log_price']
# Numeric features only for the first model
inputs = data_cleaned.loc[:, num_cols]
data_cleaned.head(n=5)

Unnamed: 0,Brand,Price,Body,Mileage,EngineV,Engine Type,Registration,Year,Model,log_price
0,BMW,4200.0,sedan,277,2.0,Petrol,yes,1991,320,8.34284
1,Mercedes-Benz,7900.0,van,427,2.9,Diesel,yes,1999,Sprinter 212,8.974618
2,Mercedes-Benz,13300.0,sedan,358,5.0,Gas,yes,2003,S 500,9.495519
3,Audi,23000.0,crossover,240,4.2,Petrol,yes,2007,Q7,10.043249
4,Toyota,18300.0,crossover,120,2.0,Petrol,yes,2011,Rav 4,9.814656


In [18]:
# Baseline: numeric features only, log-price as the target
LR_Model(inputs=inputs, targets=targets)

Train set:  0.9637043603226991
Test set:  0.8163800681286658
Year                                               Importance: 0.59
EngineV                                            Importance: 0.26
Mileage                                            Importance: 0.14


# Categorical variables

In [19]:
# Look at the cleaned frame again before encoding its categorical columns
data_cleaned.head(n=5)

Unnamed: 0,Brand,Price,Body,Mileage,EngineV,Engine Type,Registration,Year,Model,log_price
0,BMW,4200.0,sedan,277,2.0,Petrol,yes,1991,320,8.34284
1,Mercedes-Benz,7900.0,van,427,2.9,Diesel,yes,1999,Sprinter 212,8.974618
2,Mercedes-Benz,13300.0,sedan,358,5.0,Gas,yes,2003,S 500,9.495519
3,Audi,23000.0,crossover,240,4.2,Petrol,yes,2007,Q7,10.043249
4,Toyota,18300.0,crossover,120,2.0,Petrol,yes,2011,Rav 4,9.814656


### All

In [20]:
# Every categorical column of the cleaned frame
cat_vars = ['Brand','Body','Engine Type','Registration','Model']
# One-hot encode them; drop_first=True leaves out the first level of each variable
cat_df = pd.get_dummies(data_cleaned.loc[:, cat_vars], drop_first=True)
print(cat_df.shape)
cat_df.head(n=5)

(3867, 305)


Unnamed: 0,Brand_BMW,Brand_Mercedes-Benz,Brand_Mitsubishi,Brand_Renault,Brand_Toyota,Brand_Volkswagen,Body_hatch,Body_other,Body_sedan,Body_vagon,...,Model_Vito,Model_X1,Model_X3,Model_X5,Model_X5 M,Model_X6,Model_X6 M,Model_Yaris,Model_Z3,Model_Z4
0,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# All categorical dummies + numeric features, predicting the log of the price
targets = data_cleaned['log_price']
num_df = data_cleaned.loc[:, num_cols]
Predicting_Model(num_df=num_df, cat_df=cat_df, targets=targets)

Train set:  0.9904771789687203
Test set:  0.9363432254455195
Year                                               Importance: 0.57
EngineV                                            Importance: 0.21
Registration_yes                                   Importance: 0.07
Mileage                                            Importance: 0.05
Brand_Mercedes-Benz                                Importance: 0.01
Brand_Renault                                      Importance: 0.01
Model_Kangoo                                       Importance: 0.01
Brand_BMW                                          Importance: 0.0
Brand_Mitsubishi                                   Importance: 0.0
Brand_Toyota                                       Importance: 0.0
Brand_Volkswagen                                   Importance: 0.0
Body_hatch                                         Importance: 0.0
Body_other                                         Importance: 0.0
Body_sedan                                         Importance

In [22]:
# Same features as the previous run, but predicting the raw price instead of its log
# (cat_df still holds the dummies of all categorical variables built earlier)
targets = data_cleaned['Price']
num_df = data_cleaned.loc[:, num_cols]
Predicting_Model(num_df=num_df, cat_df=cat_df, targets=targets)

Train set:  0.9896558446393972
Test set:  0.9387302805354095
Year                                               Importance: 0.5
EngineV                                            Importance: 0.3
Mileage                                            Importance: 0.06
Brand_Mercedes-Benz                                Importance: 0.03
Brand_BMW                                          Importance: 0.01
Body_sedan                                         Importance: 0.01
Registration_yes                                   Importance: 0.01
Model_Multivan                                     Importance: 0.01
Model_V 250                                        Importance: 0.01
Model_X5                                           Importance: 0.01
Brand_Mitsubishi                                   Importance: 0.0
Brand_Renault                                      Importance: 0.0
Brand_Toyota                                       Importance: 0.0
Brand_Volkswagen                                   Importanc

### Brand

In [23]:
# Numeric features plus the 'Brand' dummies only, predicting the raw price
cat_vars = ['Brand']
cat_df = pd.get_dummies(data_cleaned.loc[:, cat_vars], drop_first=True)
targets = data_cleaned['Price']
num_df = data_cleaned.loc[:, num_cols]
Predicting_Model(num_df=num_df, cat_df=cat_df, targets=targets)

Train set:  0.9776889946261619
Test set:  0.903950444842313
Year                                               Importance: 0.51
EngineV                                            Importance: 0.32
Mileage                                            Importance: 0.09
Brand_Mercedes-Benz                                Importance: 0.04
Brand_BMW                                          Importance: 0.02
Brand_Toyota                                       Importance: 0.01
Brand_Mitsubishi                                   Importance: 0.0
Brand_Renault                                      Importance: 0.0
Brand_Volkswagen                                   Importance: 0.0


### Body

In [24]:
# Numeric features plus the 'Body' dummies only, predicting the raw price
cat_vars = ['Body']
cat_df = pd.get_dummies(data_cleaned.loc[:, cat_vars], drop_first=True)
targets = data_cleaned['Price']
num_df = data_cleaned.loc[:, num_cols]
Predicting_Model(num_df=num_df, cat_df=cat_df, targets=targets)

Train set:  0.9638153244241767
Test set:  0.8710103584783359
Year                                               Importance: 0.52
EngineV                                            Importance: 0.34
Mileage                                            Importance: 0.1
Body_sedan                                         Importance: 0.02
Body_other                                         Importance: 0.01
Body_van                                           Importance: 0.01
Body_hatch                                         Importance: 0.0
Body_vagon                                         Importance: 0.0


### Engine Type

In [25]:
# Numeric features plus the 'Engine Type' dummies only, predicting the raw price
cat_vars = ['Engine Type']
cat_df = pd.get_dummies(data_cleaned.loc[:, cat_vars], drop_first=True)
targets = data_cleaned['Price']
num_df = data_cleaned.loc[:, num_cols]
Predicting_Model(num_df=num_df, cat_df=cat_df, targets=targets)

Train set:  0.9599745034644375
Test set:  0.849428064800963
Year                                               Importance: 0.52
EngineV                                            Importance: 0.34
Mileage                                            Importance: 0.12
Engine Type_Petrol                                 Importance: 0.01
Engine Type_Gas                                    Importance: 0.0
Engine Type_Other                                  Importance: 0.0


### Registration

In [26]:
# Numeric features plus the 'Registration' dummy only, predicting the raw price
cat_vars = ['Registration']
cat_df = pd.get_dummies(data_cleaned.loc[:, cat_vars], drop_first=True)
targets = data_cleaned['Price']
num_df = data_cleaned.loc[:, num_cols]
Predicting_Model(num_df=num_df, cat_df=cat_df, targets=targets)

Train set:  0.9545402030386358
Test set:  0.8386860333301264
Year                                               Importance: 0.53
EngineV                                            Importance: 0.35
Mileage                                            Importance: 0.12
Registration_yes                                   Importance: 0.01


### Model

In [27]:
# Numeric features plus the 'Model' dummies only, predicting the raw price
cat_vars = ['Model']
cat_df = pd.get_dummies(data_cleaned.loc[:, cat_vars], drop_first=True)
targets = data_cleaned['Price']
num_df = data_cleaned.loc[:, num_cols]
Predicting_Model(num_df=num_df, cat_df=cat_df, targets=targets)

Train set:  0.9872305323252878
Test set:  0.9296954965609795
Year                                               Importance: 0.5
EngineV                                            Importance: 0.31
Mileage                                            Importance: 0.06
Model_GLE-Class                                    Importance: 0.01
Model_Multivan                                     Importance: 0.01
Model_S 350                                        Importance: 0.01
Model_V 250                                        Importance: 0.01
Model_Viano                                        Importance: 0.01
Model_X5                                           Importance: 0.01
Model_100                                          Importance: 0.0
Model_11                                           Importance: 0.0
Model_116                                          Importance: 0.0
Model_118                                          Importance: 0.0
Model_120                                          Importanc

In [28]:
# Encode every categorical variable again ...
cat_vars = ['Brand','Body','Engine Type','Registration','Model']
cat_df = pd.get_dummies(data_cleaned.loc[:, cat_vars], drop_first=True)
# ... then keep only a hand-picked subset of the resulting dummy columns
selected_vars = [
    'Brand_Mercedes-Benz',
    'Brand_BMW',
    'Brand_Renault',
    'Registration_yes',
    'Model_S 350',
    'Model_V 250',
    'Model_X5',
]
cat_df = cat_df.loc[:, selected_vars]
targets = data_cleaned['Price']
num_df = data_cleaned.loc[:, num_cols]
Predicting_Model(num_df=num_df, cat_df=cat_df, targets=targets)

Train set:  0.9808175984418281
Test set:  0.9066871320219378
Year                                               Importance: 0.51
EngineV                                            Importance: 0.32
Mileage                                            Importance: 0.09
Brand_Mercedes-Benz                                Importance: 0.04
Brand_BMW                                          Importance: 0.01
Registration_yes                                   Importance: 0.01
Model_V 250                                        Importance: 0.01
Model_X5                                           Importance: 0.01
Brand_Renault                                      Importance: 0.0
Model_S 350                                        Importance: 0.0
