# Import data

In [None]:
import pandas as pd

# Read the automobile price table and keep every row whose make is not "ferrari".
df = pd.read_csv("automobile_prices.csv").loc[
    lambda frame: frame["make"] != "ferrari"
]
df.head()

In [None]:
# Column names, dtypes and non-null counts of the loaded table.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns.
df.describe()

In [None]:
# (rows, columns) of the table after the Ferrari rows were removed.
df.shape

## Select only 7 features out of 25, plus one label (=price)

In [None]:
# Narrow the table to the seven predictor columns plus the target column "price".
df = df.loc[
    :,
    [
        "make",
        "body-style",
        "wheel-base",
        "engine-size",
        "horsepower",
        "peak-rpm",
        "highway-mpg",
        "price",
    ],
]
df

# Data Cleaning

In [None]:
# Re-check dtypes and non-null counts, now for the reduced column set.
df.info()

In [None]:
# Preview the first five rows of the reduced table.
df.head()

In [None]:
# Report the table, drop incomplete rows, then report it again for comparison.
print("Dataframe shape BEFORE removing the Nan")
df.info()

# A row is kept only when both horsepower and peak-rpm are present.
df = df.loc[
    lambda frame: frame[['horsepower', 'peak-rpm']].notna().all(axis=1)
]

print("\n\nDataframe shape AFTER removing the Nan:")
df.info()

# Split dataset into train set and test set

In [None]:
# Feature matrix: every column except the target.
X = df.drop(columns="price")
X

In [None]:
# Target vector: the price column on its own.
y = df.loc[:, "price"]
y

In [None]:
!pip install scikit-learn

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Print the shape of each split piece, one per line.
print(
    *(part.shape for part in (X_train, X_test, y_train, y_test)),
    sep="\n",
)

In [None]:
# Shapes of the four split pieces, displayed as a single list.
[
    part.shape
    for part in (X_train, X_test, y_train, y_test)
]

# Encoding

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Text-valued columns that must be one-hot encoded before a linear model can use them.
non_numerical_cols = ['make', 'body-style']

# handle_unknown='ignore' encodes a category that never occurs in the training rows
# as an all-zero block instead of raising when it appears at transform time.
transformer = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), non_numerical_cols)],
    remainder='passthrough',  # keep every other column unchanged
)

# Learn the category vocabulary from the training rows only, so that nothing about
# the held-out rows leaks into preprocessing; then encode the full table with it.
transformer.fit(X_train)
X_encoded = transformer.transform(X)

In [None]:
# (rows, columns) of the encoded matrix; the original run reported the 7 feature
# columns growing to 31 columns after one-hot encoding.
X_encoded.shape # from 7 to 31 columns (or features)

In [None]:
# Split the encoded matrix with the same test_size and random_state as the raw split,
# which assigns the same rows to train and test.
X_train_encoded, X_test_encoded, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# (rows, columns) of the encoded training matrix.
X_train_encoded.shape

# Training!

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the encoded training rows; fit() returns the estimator itself.
model = LinearRegression().fit(X_train_encoded, y_train)
model

# Prediction

In [None]:
# First five predicted prices for the training rows.
model.predict(X_train_encoded)[:5]

In [None]:
# First five actual training prices, for comparison with the predictions.
y_train.head()

In [None]:
# Predict prices for the held-out rows and preview the first five predictions.
y_test_hat = model.predict(X_test_encoded)
y_test_hat[:5]

In [None]:
# First five actual held-out prices, for comparison with the predictions.
y_test.head()

# Evaluate

In [None]:
# Training-set metrics - just for curiosity, since the model was fitted on these rows
from sklearn.metrics import mean_absolute_error, r2_score

y_train_hat = model.predict(X_train_encoded)

# MAE (mean absolute error) and R2 of the predictions against the actual training prices
mae, r2 = (
    mean_absolute_error(y_train, y_train_hat),
    r2_score(y_train, y_train_hat),
)

print(f"mean absolute error  = {mae}")
print(f"r-squared score  = {r2}")

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

# MAE (mean absolute error) and R2 of the predictions against the actual held-out prices
mae, r2 = (
    mean_absolute_error(y_test, y_test_hat),
    r2_score(y_test, y_test_hat),
)

print(f"mean absolute error  = {mae}")
print(f"r-squared score  = {r2}")

# Visualization

In [None]:
import matplotlib.pyplot as plt

# Scatter of predicted against actual prices for the training rows.
fig, ax = plt.subplots()
ax.scatter(y_train, y_train_hat)
ax.set(
    xlabel="Actual Values",
    ylabel="Predicted Values",
    title="Actual vs Predicted Values for Train data",
)

plt.show()

In [None]:
import matplotlib.pyplot as plt

# Scatter of predicted against actual prices for the held-out rows.
fig, ax = plt.subplots()
ax.scatter(y_test, y_test_hat)
ax.set(
    xlabel="Actual Values",
    ylabel="Predicted Values",
    title="Actual vs Predicted Values for Test data",
)

plt.show()