# Car Price Prediction Project


In [2]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

import matplotlib.pyplot as plt
import pandas as pd
# Imports
import numpy as np
#from helper import *
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_log_error
#import category_encoders as ce
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import Binarizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
#------------------------------------------------------------------

## Load dataset and clean
We found there were some missing values we could logically fill, others not.

For example if state value is missing (which is true in 10 records), then use the mileage to set state. If mileage is 0 then set state as New.

There are 8 records with model missing, so we filled them with the most freuent model.

Other than that we fixed some mistakes like there were models 'A5' vs ' A5' (there is a space), so we combined these.

In [None]:
def loadAndClean(filepath):

    df = pd.read_csv(filepath)

    # nan count in each column
    nanCount = df.isnull().sum()
    #print(df.isnull().sum())
    #Model, mileage and state have missing values
    
    # Replace missing models withthe most freuent one, A3
    frequentModel =df['model'].value_counts()[df['model'].value_counts() == df['model'].value_counts().max()].idxmax()
    df['model'] = df['model'].fillna(frequentModel)
    
    #print(df[df['mileage'].isna()])
    # From here we know that 5 missing mileages are of new cars, so put mileage = 0 
    # otherwise put mileage = ave of mileages
    
    df.loc[df.state == 'New', 'mileage'] = 0
    
    # For the 2 remainig Nan in mileage, set mileage as the median (or mean but of the non-zero mileages so median is better)
    df['mileage'] = df['mileage'].fillna(df['mileage'].median())
    
    #For missing states, put as New if mileage == 0, o.w put as used 
    #print(df[df['state'].isna()])
    df.loc[df.mileage == 0, 'state'] = 'New'
    df.loc[df.mileage != 0, 'state'] = 'Used'
    
    #print(df.isnull().sum()) # No more missing
    
    # Assume that A1. is A1, A5. is A5    
    df.loc[df.model == ' A1.', 'model'] = " A1"
    df.loc[df.model == ' A5.', 'model'] = ' A5'
    
    # Mpg has value < 0 so eliminate 
    df.loc[df.mpg <= 0, 'mpg'] = df['mpg'].median()
    
    return df, nanCount, frequentModel

In [4]:
# Load dataset 
data, nanCount, Cm = loadAndClean('train.csv')

### Preprocessing of data 
We want to remove any categorical data.
All of our categorical features are nominal, except for engine size which is ordinal.
For ordinal data we will set the values our selves, for nominal data we will use Sklearn's label encoder.

In [None]:
def loadAndClean(filepath):

    df = pd.read_csv(filepath)

    # nan count in each column
    nanCount = df.isnull().sum()
    #print(df.isnull().sum())
    #Model, mileage and state have missing values
    
    # Replace missing models withthe most freuent one, A3
    frequentModel =df['model'].value_counts()[df['model'].value_counts() == df['model'].value_counts().max()].idxmax()
    df['model'] = df['model'].fillna(frequentModel)
    
    #print(df[df['mileage'].isna()])
    # From here we know that 5 missing mileages are of new cars, so put mileage = 0 
    # otherwise put mileage = ave of mileages
    
    df.loc[df.state == 'New', 'mileage'] = 0
    
    # For the 2 remainig Nan in mileage, set mileage as the median (or mean but of the non-zero mileages so median is better)
    df['mileage'] = df['mileage'].fillna(df['mileage'].median())
    
    #For missing states, put as New if mileage == 0, o.w put as used 
    #print(df[df['state'].isna()])
    df.loc[df.mileage == 0, 'state'] = 'New'
    df.loc[df.mileage != 0, 'state'] = 'Used'
    
    #print(df.isnull().sum()) # No more missing
    
    # Assume that A1. is A1, A5. is A5    
    df.loc[df.model == ' A1.', 'model'] = " A1"
    df.loc[df.model == ' A5.', 'model'] = ' A5'
    
    # Mpg has value < 0 so eliminate 
    df.loc[df.mpg <= 0, 'mpg'] = df['mpg'].median()
    
    return df, nanCount, frequentModel

In [5]:
def preprocess(data):
    df = data
    # TODO: Check model & year
    categ = ["model", "year", "transmission", "fuelType", "tax", "state"]
    
    sizesEngine = {"XXSmall": 1, 'XSmall': 2, 'Small': 3, 'Medium': 4, 'Large': 5, 'XLarge': 6, 'XXLarge': 7}
    df['engineSize'] = df['engineSize'].map(sizesEngine)
    
    # Encode Categorical Columns
    le = LabelEncoder()
    #df["state"] = le.fit_transform(df["state"])
    df[categ] = df[categ].apply(le.fit_transform)

    # Remove ownerName , ID & price cols
    df = df.drop(columns = ["ownerName", "ID"], axis=1)

    return df
# Standarization train, test data
def standarize(Y_train, Y_test):

    Y_train_stand = Y_train.copy()
    Y_test_stand = Y_test.copy()


    # fit on training data column
    scale = StandardScaler().fit(Y_train_stand)

    # transform the training data column
    Y_train_stand = scale.transform(Y_train_stand)

    # transform the testing data column
    Y_test_stand = scale.transform(Y_test_stand)

    return Y_train_stand, Y_test_stand

# Normalization
def normalize(Y_train, Y_test):

    norm = MinMaxScaler().fit(Y_train)

    # transform training data
    Y_train_norm = norm.transform(Y_train)

    # transform testing dataabs
    Y_test_norm = norm.transform(Y_test)

    return Y_train_norm, Y_test_norm



# Detect outliers using z-scores
def detect_outliers_zscore(data):
    outliers = []
    thres = 3
    mean = np.mean(data)
    std = np.std(data)
    # print(mean, std)
    for i in data:
        z_score = (i-mean)/std
        if (np.abs(z_score) > thres):
            outliers.append(i)
    return outliers

# Detect outliers using inter-quartile range
def detect_outliers_iqr(temp):
    outliers = []
    temp = sorted(temp)
    q1 = np.percentile(temp, 25)
    q3 = np.percentile(temp, 75)
    # print(q1, q3)
    IQR = q3-q1
    lwr_bound = q1-(1.5*IQR)
    upr_bound = q3+(1.5*IQR)
    # print(lwr_bound, upr_bound)
    for i in temp: 
        if (i<lwr_bound or i>upr_bound):
            outliers.append(i)
    return outliers

# Detect outliers using botplot (1D Array)
def detect_outliers_boxplot(data):
    plt.boxplot(data, vert=False)
    plt.title("Detecting outliers using Boxplot")
    plt.xlabel('Sample')
    plt.show()

# Handling outliers using Median imputation
def handle_outliers_medium_imputation(data, outliers):
    median = np.median(data)# Replace with median
    clean = []
    for i in outliers:
        clean = np.where(data==i, median, data)
    
    #print('median = ',median)
    return clean


# Handling outliers using trimming
def handle_outliers_trimming(data, outliers):
    copy = data.copy()
    clean = []
    for i in outliers:
        clean = np.delete(copy, np.where(data==i))
    return clean

def detect_handle_outliers_trimming(sample):
    copy = sample.copy()

    # IQR
    Q1 = np.percentile(copy, 25,
                    interpolation = 'midpoint')
    
    Q3 = np.percentile(copy, 75,
                    interpolation = 'midpoint')
    IQR = Q3 - Q1
    
    print("Old Shape: ", copy.shape)

    # Upper bound
    upper = np.where(copy >= (Q3+1.5*IQR))
    # Lower bound
    lower = np.where(copy <= (Q1-1.5*IQR))

    copy.drop(upper[0], inplace = True)
    copy.drop(lower[0], inplace = True)

    return copy


# Recursive Feature Selection with RandomForests
# def rfe_selection(X,y,n_features):
# 	"""
# 	Performs the Recursive Feature Elimination method and selects the top ranking features

# 	Keyword arguments:
# 	X -- The feature vectors
# 	y -- The target vector
# 	n_features -- n best ranked features
# 	"""


# 	clf=RandomForestClassifierWithCoef(n_estimators=10,n_jobs=-1)
# 	fs= RFE(clf, n_features, step=1)
# 	fs= fs.fit(X,y)
# 	ranks=fs.ranking_

# 	feature_indexes=[]
# 	for i in xrange(len(ranks)):
# 		if ranks[i]==1:
# 			feature_indexes+=[i]

# 	return X[:,feature_indexes[0:n_features]],feature_indexes[0:n_features]

## Preprocess, remove outliers, and fit to different models to get best accuracy
We observe the data's correlation between price and the features after we preprocessed the data. There are somefeatures that are more significant than other, e.g the important ones are model, year, mpg (negatively correlated).

## Modeling
We attempted 5 different models to predict he car price.
As we fed the data into each model, we tried to pass K features each time, from 1 to 9 features, to see their effect on model accuracy. 
The best accuracy we got was with Random Forest Classifier, and Bayes classifier, both with 9 features.

In [6]:
# Integer encoding & Irrelevant columns removal
data = preprocess(data)
#------------------------------------------------------------------

# Summary
print(data.shape)
#print(data.describe())

#------------------------------------------------------------------
# Outliers Handling - Medium imputation
cols = ['mpg']
for feature in cols:
    # Detect and handle outliers
    outliers = detect_outliers_iqr(data[feature])
    # print('Feature '+feature+'outliers: ', outliers)
    #print('outliers before handling', outliers)
    sample = handle_outliers_medium_imputation(data[feature], outliers)
    outliers = detect_outliers_iqr(sample)
    #print('outliers after handling: ', outliers)
    
#print(outliers)
#print(data.shape)
#------------------------------------------------------------------

# Outliers handling - Trimming
# from scipy import stats
# data = data[(np.abs(stats.zscore(data)) < 3).all(axis=1)]

# TODO: Try trimming certain feilds
#------------------------------------------------------------------


# Correlation
corr = data.corr()

print(corr)



# Correlation output:
#   Very low: engineSize, transmission, fuelType
#   Average: model, tax 
#   High: year 
#   NEGATIVE Average: mileage, mpg, state

# output after feature extraction
output = data['price']

# Remove price column
data = data.drop(columns = ['price'], axis=1)


#------------------------------------------------------------------
print(data.shape)

# Loop on k features
K = range(2, 10)
pf_scores = []
svr_rbf_scores = []
svr_linear_scores = []
svr_poly_scores = []
lr_scores = []
dt_scores = []
rf_scores = []
bayes_scores=[]


poly_features_switch = True
svr_rbf_switch = False
svr_linear_switch = True
svr_poly_switch = False
lr_switch = True
dt_switch = True
rf_switch = True
bayes_switch = True
#print(data.head())

for k in K:

    # Convert to numpy array
    X = data.to_numpy()

    # Labels (prices)
    Y = output.to_numpy()

    #------------------------------------------------------------------
    # # Feature Selection
    #f_statistic, p_values = f_regression(X,Y)

    #print(f_statistic) #scores

    #print(p_values) #p values
    # TODO: Try other filtering, wrapping methods
    model = SelectKBest(score_func=f_regression, k=k) 
    #ridge = Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
    #normalize=False, random_state=None, solver='auto', tol=0.001)
    #model = ridge.fit(X,Y)
    # results = model.fit(X,Y)
    
    # print (results.scores_)
    # print (results.pvalues_)

    X = model.fit_transform(X,Y) # New data with top k features

    #------------------------------------------------------------------

    # Reshape to 2d array -> column
    #Y = Y.reshape((len(Y), 1))   

    # Split
    X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2,random_state=1)

    # Standarize
    #Y_train_std, Y_test_std = standarize(Y_train, Y_test)

    # Shapes
    #print('Train', X_train.shape, Y_train.shape)
    #print('Test', X_test.shape, Y_test.shape)

    #------------------------------------------------------------------
    # Reshape
    # Y_train_std = Y_train_std.reshape(Y_train_std.shape[0], -1)
    # Y_test_std = Y_test_std.reshape(Y_test_std.shape[0], -1)

    # Classifiers
    # TODO: use scoring='neg_mean_squared_log_error' on crossvalidation
    highest_pipe = None
    highest_score = 0
    if poly_features_switch:
        poly_reg = PolynomialFeatures(degree = k)
        X_poly = poly_reg.fit_transform(X_train)
        poly_features_pipe = Pipeline([('scaler', MinMaxScaler()), ('polynomial_features', LinearRegression() )])

        cv_results = cross_validate(poly_features_pipe, X_train, Y_train, cv=5) # 
        scores = cv_results['test_score']
        score = np.average(scores)
        # poly_features_pipe.fit(X_train, Y_train)
        # score = poly_features_pipe.score(X_test,Y_test)
        pf_scores.append(score)

    if svr_rbf_switch:
        svr_rbf_pipe = Pipeline([('scaler', MinMaxScaler()), ('svc', SVR(kernel='rbf', gamma=0.1))])
        cv_results = cross_validate(svr_rbf_pipe, X_train, Y_train, cv=5) # 
        scores = cv_results['test_score']
        score = np.average(scores)
        # svr_rbf_pipe.fit(X_train, Y_train)
        # score = svr_rbf_pipe.score(X_test,Y_test)
        svr_rbf_scores.append(score)

    # plt.scatter(X_train, Y_train, color='red', label='Actual observation points')
    # plt.plot(X_train, pipe.predict(X_train), label='SVR regressor')
    # plt.title('Truth or bluff (SVR Regression)')
    # plt.xlabel('Position Level')
    # plt.ylabel('Salary')

    # plt.legend()
    # plt.show()

    # Train using a linear kernel
    if svr_linear_switch:
        svr_linear_pipe = Pipeline([('scaler', MinMaxScaler()), ('svc', SVR(kernel='linear'))])
        cv_results = cross_validate(svr_linear_pipe, X_train, Y_train, cv=5) # 
        scores = cv_results['test_score']
        score = np.average(scores)
        #svr_linear_pipe.fit(X_train, Y_train)
        #score = svr_linear_pipe.score(X_test,Y_test)
        svr_linear_scores.append(score)

    # Visualize
    # plt.scatter(X_train, Y_train, color='red', label='Actual observation points')
    # plt.plot(X_train, pipe.predict(X_train), label='SVR regressor')
    # plt.title('Truth or bluff (SVR Regression)')
    # plt.xlabel('Position Level')
    # plt.ylabel('Salary')

    # plt.legend()
    # plt.show()
    if svr_poly_switch:
        # Train using a polynomial kernel
        svr_poly_pipe = Pipeline([('scaler', MinMaxScaler()), ('svc', SVR(kernel='poly', degree=2))])
        cv_results = cross_validate(svr_poly_pipe, X_train, Y_train, cv=5) # 
        scores = cv_results['test_score']
        score = np.average(scores)
        # svr_poly_pipe.fit(X_train, Y_train)
        # score = svr_poly_pipe.score(X_test,Y_test)
        svr_poly_scores.append(score)

    # Visualize
    # plt.scatter(X_train, Y_train, color='red', label='Actual observation points')
    # plt.plot(X_train, pipe.predict(X_train), label='SVR regressor')
    # plt.title('Truth or bluff (SVR Regression)')
    # plt.xlabel('Position Level')
    # plt.ylabel('Price')

    # plt.legend()
    # plt.show()
    if lr_switch:
        lr_pipe = Pipeline([('scaler', MinMaxScaler()), ('LR', LinearRegression())])
        #pipe.fit(np.array(X_train.reshape(-1, 1)), Y_train.reshape(-1, 1))
        cv_results = cross_validate(lr_pipe, X_train, Y_train, cv=5) # 
        scores = cv_results['test_score']
        score = np.average(scores)
        # pipe.fit(X_train, Y_train)
        # #y_predict = pipe.predict(X_test.reshape(-1, 1))
        # score = pipe.score(X_test, Y_test)
        lr_scores.append(score)

    # Visualize
    # plt.scatter(X_train, Y_train, color='red', label='Actual observation points')
    # plt.plot(X_train, pipe.predict(X_train), label='LinearRegression')
    # plt.title('Truth or bluff (LinearRegression)')
    # plt.xlabel('Position Level')
    # plt.ylabel('Price')

    # plt.legend()
    # plt.show()

    if dt_switch:
        dt_pipe = Pipeline([('scaler', MinMaxScaler()), ('LR',  DecisionTreeRegressor(random_state = 0))])
        cv_results = cross_validate(dt_pipe, X_train, Y_train, cv=5) # 
        scores = cv_results['test_score']
        score = np.average(scores)
        # dt_pipe.fit(X_train, Y_train)
        # score = dt_pipe.score(X_test, Y_test)
        dt_scores.append(score)


    # # Visualize
    # plt.scatter(X_train, Y_train, color='red', label='Actual observation points')
    # plt.plot(X_train, pipe.predict(X_train), label='DecisionTreeRegressor')
    # plt.title('Truth or bluff (DecisionTreeRegressor)')
    # plt.xlabel('Position Level')
    # plt.ylabel('Price')

    # plt.legend()
    # plt.show()
    if rf_switch:
        rf_pipe = Pipeline([('scaler', MinMaxScaler()), ('RF',  RandomForestRegressor(n_estimators=300,random_state = 0))])
        cv_results = cross_validate(rf_pipe, X_train, Y_train, cv=5) # 
        scores = cv_results['test_score']
        score = np.average(scores)
        # rf_pipe.fit(X_train, Y_train)
        # score = rf_pipe.score(X_test, Y_test)
        if score > highest_score:
            highest_score = score
            highest_pipe = rf_pipe
        rf_scores.append(score)
    
    if bayes_switch:
        bayes_pipe = Pipeline(steps=[('scaler', MinMaxScaler()), ('Bayes', BayesianRidge())])
        cv_results = cross_validate(bayes_pipe, X_train, Y_train, cv=5) # 
        scores = cv_results['test_score']
        score = np.average(scores)
        # bayes_pipe.fit(X_train, Y_train)
        # score = bayes_pipe.score(X_test, Y_test)
        bayes_scores.append(score)

print('polynomial_features scores: ',pf_scores) # 90
#print('SVR(rbf) scores: ',svr_rbf_scores)
print('SVR(linear) scores: ',svr_linear_scores)
#print('SVR(poly) scores: ',svr_poly_scores)
print('Linear Regression scores: ', lr_scores) #
print('DecisionTreeRegressor scores: ',dt_scores) # 90
print('RandomForestRegressor scores: ',rf_scores) # 90
print('BayesClassifier scores: ',bayes_scores)

print('Highest score: ', max(rf_scores))

# Show accruacies on graphs (K vs classifier scores)
# Uncomment and handle in python the following lines to show graph
# %matplotlib inline 

# plt.plot(K, rf_scores)
# plt.xlabel('rf_scores')
# plt.ylabel('Testing Accuracy')



(7438, 10)
                 model      year     price  transmission   mileage  fuelType  \
model         1.000000  0.146919  0.459745      0.045738 -0.103247 -0.213408   
year          0.146919  1.000000  0.741618      0.035126 -0.785996  0.172324   
price         0.459745  0.741618  1.000000      0.056138 -0.651557  0.028586   
transmission  0.045738  0.035126  0.056138      1.000000 -0.032799  0.083310   
mileage      -0.103247 -0.785996 -0.651557     -0.032799  1.000000 -0.279156   
fuelType     -0.213408  0.172324  0.028586      0.083310 -0.279156  1.000000   
tax           0.439585  0.130854  0.368205      0.042936 -0.176372  0.062920   
mpg          -0.404112 -0.414537 -0.609795     -0.049258  0.426112 -0.345812   
engineSize    0.376159 -0.013678  0.365456     -0.020940  0.123172 -0.598320   
state        -0.069070 -0.222156 -0.227344      0.049052  0.225497 -0.014391   

                   tax       mpg  engineSize     state  
model         0.439585 -0.404112    0.376159 -0.069

## Using model with the test set

In [7]:
# Test set
test_data, nanCount, Cm = loadAndClean('test.csv')

Ids = test_data['ID']

test_data = preprocess(test_data)

model = SelectKBest(score_func=f_regression, k=9) 

X = model.fit_transform(X,Y) # New data with top k features

pipe = Pipeline([('scaler', MinMaxScaler()), ('RF',  RandomForestRegressor(n_estimators=100,random_state = 0))])

pipe.fit(X,Y)

concat = [Ids, pipe.predict(test_data)]


with open('Submission.txt','w') as f:
    for x in zip(*concat):
        f.write("{0},{1}\n".format(*x))



  f"X has feature names, but {self.__class__.__name__} was fitted without"
