In [None]:
# Goal: To predict the prices of cars using the continuous features of the available data

import pandas as pd
import seaborn as sns
import numpy as np

%matplotlib inline

In [None]:
# Read the training and hold-out splits (in that order) from the working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# Preview the first rows of the training split.
train_df.head()

In [None]:
def clean_df(input_df):
    """Return a cleaned copy of a raw car-price frame; ``input_df`` is not modified.

    Steps, in order:
      1. Drop the ``normalized-losses`` column.
      2. Replace every ``'?'`` placeholder, in any remaining column, with NaN.
      3. Cast ``horsepower``, ``peak-rpm``, ``city-mpg``, ``highway-mpg`` and
         ``price`` to float.
      4. Drop every row that still contains a missing value.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Raw frame containing at least ``normalized-losses`` and the five
        numeric columns listed above.

    Returns
    -------
    pandas.DataFrame
        New frame with the original index labels of the surviving rows.

    Raises
    ------
    KeyError
        If ``normalized-losses`` or one of the numeric columns is absent.
    ValueError
        If a numeric column holds a non-numeric string other than ``'?'``.
    """
    numeric_columns = ["horsepower", "peak-rpm", "city-mpg", "highway-mpg", "price"]

    # drop() returns a new frame, so the caller's frame is left untouched.
    df = input_df.drop(columns=["normalized-losses"])

    # Fix: the earlier `df.replace('?', None)` threw its result away, so '?'
    # placeholders in columns other than the five below were never turned into
    # missing values and slipped past dropna(). Keep the result and use NaN.
    df = df.replace("?", np.nan)

    # One cast replaces five copy-pasted per-column lambdas; NaN passes through.
    df = df.astype({column: float for column in numeric_columns})

    return df.dropna()

In [None]:
# Push both splits through the same cleaning routine.
train_df, test_df = map(clean_df, (train_df, test_df))

In [None]:
# The model target is log(price) rather than price, to tame the skew in the raw prices.
for frame in (train_df, test_df):
    frame["log_price"] = np.log(frame["price"])

In [None]:
# After exploratory data analysis, we discover the relevant parameters to be taken into consideration
# Introducing new features in the data (identical transforms for both splits):
#   city-mpg_log       = log(city-mpg)
#   engine-size_sqroot = sqrt(engine-size)
#   hp_log_sq          = log(horsepower) ** 2
for frame in (train_df, test_df):
    frame["city-mpg_log"] = np.log(frame["city-mpg"])

    # Fix: this feature used to be computed from "city-mpg" (copy-paste slip),
    # which made it a mere re-encoding of the column above instead of carrying
    # engine-size information.
    # NOTE(review): assumes "engine-size" is present and already numeric — confirm.
    frame["engine-size_sqroot"] = np.sqrt(frame["engine-size"])

    frame["hp_log_sq"] = np.log(frame["horsepower"]) ** 2

In [None]:
# Columns carried into modelling: the target (log_price) followed by the predictors.
rel_features = [
    "log_price",
    "city-mpg_log",
    "curb-weight",
    "engine-size_sqroot",
    "wheel-base",
    "width",
    "hp_log_sq",
]

# Keep only those columns; filter() silently skips any label that is absent.
new_train_df = train_df.filter(items=rel_features)
new_test_df = test_df.filter(items=rel_features)

In [None]:
# Separate the target column from the predictors, split by split.
X_train = new_train_df.drop(columns=["log_price"])
y_train = new_train_df["log_price"]

X_test = new_test_df.drop(columns=["log_price"])
y_test = new_test_df["log_price"]

In [None]:
# importing the required models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [None]:
# Linear-regression baseline: fit on the training split, then display its
# score on the held-out split.
lin_regressor = LinearRegression().fit(X_train, y_train)
lin_regressor.score(X_test, y_test)

In [None]:
# Implementing random forest regression.
# random_state is pinned because the forest bootstraps rows and samples
# features at random; without a seed the score below changes on every
# Restart & Run All, so runs cannot be compared.
rf_regressor = RandomForestRegressor(random_state=42)
rf_regressor.fit(X_train, y_train)
rf_regressor.score(X_test, y_test)

In [None]:
# Using XGBoost (gradient-boosted trees) to try to improve on the scores above:
# fit on the training split, then display the score on the held-out split.
xgb_regressor = XGBRegressor().fit(X_train, y_train)
xgb_regressor.score(X_test, y_test)