### Project: model to estimate car values


In [None]:
# import
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# Load the listings; the file is decoded as ISO-8859-1
df = pd.read_csv("../res/autos.csv", encoding="iso8859-1")

# Columns discarded right after loading
unused_columns = [
    "dateCrawled", "offerType", "seller", "abtest",
    "dateCreated", "nrOfPictures", "lastSeen", "postalCode",
]
df = df.drop(columns=unused_columns)

# postal Code could be added using pyGeoDb
# https://pypi.org/project/pyGeoDb/

# Merge year and month into a single fractional-year column.
# A month value of 0 is replaced by 6 before the conversion.
month = df["monthOfRegistration"].replace(0, 6)
df["registration"] = df["yearOfRegistration"] + (month - 1) / 12
df = df.drop(columns=["yearOfRegistration", "monthOfRegistration"])

df.head()

In [None]:
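# Inspect dataset (abtest, seller and offerType are dropped in the loading cell above, so run these checks on the freshly read CSV)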
# print(df["abtest"].unique())
# print("Dataset length: %d" % len(df))
## Seller mostly private > remove
# print(df["seller"].unique())
# df["seller"].describe()
# len(df[df["seller"] == "gewerblich"])
## Offer Type > remove
# df["offerType"].describe()

In [None]:
# Remove vehicles with price zero.
# The filtered frame must be assigned back to df: DataFrame.drop (and boolean
# indexing) return a new object, so an unassigned call leaves df unchanged.
df = df[df["price"] != 0]

In [None]:
# Remove vehicles with zero PS.
# The filtered frame must be assigned back to df: DataFrame.drop (and boolean
# indexing) return a new object, so an unassigned call leaves df unchanged.
df = df[df["powerPS"] != 0]

In [None]:
# Recode the 'repaired damage' flag and discard rows where it is missing
df["notRepairedDamage"].unique()  # not the cell's last expression, so nothing is displayed

# "ja" becomes "1" and "nein" becomes "0"; every other value is left as it is
damage_codes = {"ja": "1", "nein": "0"}
df["notRepairedDamage"] = df["notRepairedDamage"].replace(damage_codes)

# Keep only the rows that carry a value in this column
df = df.dropna(subset=["notRepairedDamage"])
df.head()

In [None]:
# Visualize all data using seaborn to remove outliers
# (the thresholds picked from this plot are applied in the following cell)
# IPython magic, not Python syntax: show matplotlib figures inline in the notebook
%matplotlib inline
# Pair plot of df, colored by the "fuelType" column; the returned grid is kept in g
g = sns.pairplot(df, hue="fuelType")

In [None]:
# Cut off the extreme values spotted in the pairplot:
# keep price below 50000, power below 500 PS and registration up to 2021
price_ok = df["price"] < 50000
power_ok = df["powerPS"] < 500
registration_ok = df["registration"] <= 2021
df = df[price_ok & power_ok & registration_ok]

In [None]:
# One-hot encode the categorical columns; "model" is left out of the result
categorical_columns = ["vehicleType", "gearbox", "fuelType", "brand"]
df2 = pd.get_dummies(df.drop(columns="model"), columns=categorical_columns)
df2.head()

In [None]:
# Narrow the price range further: strictly between 500 and 20000
price = df2["price"]
df2 = df2[(price > 500) & (price < 20000)]

# Target is the price; features are everything except the name and the price
y = df2["price"]
x = df2.drop(columns=["name", "price"])

In [None]:
# Hold out a test set (fixed random_state so the split is reproducible)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

# Fit a plain linear regression on the training part
model = LinearRegression().fit(x_train, y_train)

# Print the model score on the training data first, then on the test data
for features, target in ((x_train, y_train), (x_test, y_test)):
    print(model.score(features, target))