In [None]:
## Import library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
pd.options.display.float_format = "{:.2f}".format

In [None]:
# Load the car-purchasing dataset directly from the project's GitHub repository.
url = r'https://raw.githubusercontent.com/natthawit-jan/car_price_prediction_proj/master/Car_Purchasing_Data.csv'
df = pd.read_csv(url)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

Rename the `Gender` column to `Is_Male`, where 1 indicates male and 0 indicates female.

In [None]:
# Rename by reassignment rather than inplace=True: the cell stays idempotent
# (renaming an already-renamed frame is a no-op) and no frame is mutated behind the reader's back.
df = df.rename(columns={'Gender': 'Is_Male'})

In [None]:
# Confirm that the renamed column now appears as Is_Male.
df.head()

In [None]:
# Distribution of the purchase amount, one panel per value of Is_Male.
d = sns.displot(data=df, x="Car Purchase Amount", col="Is_Male")
# Panels are laid out in a single row; label them left to right.
for panel, label in zip(d.axes[0], ('Female', 'Male')):
    panel.set_title(label)
plt.show()

In [None]:
# Correlation of every feature with the target, as a single-row frame for the heatmap.
# The frame still contains string columns at this point (they are dropped in a later cell),
# and DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so restrict to numeric
# columns explicitly before correlating.
numeric_df = df.select_dtypes('number')
cols = [name for name in numeric_df.columns if name != 'Car Purchase Amount']
P = (
    numeric_df.corr()
    # keep only the target column; drop the target's self-correlation and the gender flag rows
    .drop(columns=cols, index=['Car Purchase Amount', 'Is_Male'])
    .sort_values(by='Car Purchase Amount')
    .T
)

In [None]:
# Annotated heatmap of the feature-vs-target correlations held in P.
ax = sns.heatmap(P, cmap="YlGnBu", annot=True, cbar=False)
ax.set(title='Correlation')
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels)
plt.show()

 Correlation between all variables in the dataset, to check whether the X features are related to each other (multicollinearity)

In [None]:
# Pairwise correlation between all numeric columns (multicollinearity check).
# Only numeric columns are correlated: DataFrame.corr() raises on string columns in pandas >= 2.0.
ax = sns.heatmap(df.select_dtypes('number').corr(), annot=True, cbar=False)
ax.set_title('Pairwise correlation of numeric columns')
plt.show()

In [None]:
# Pairwise scatter plots of the columns, with the Is_Male flag left out.
sns.pairplot(data=df.drop(columns=['Is_Male']), height=2)

In [None]:
## Drop Country, Customer Name and Customer e-mail: they are not relevant to the model.
## errors='ignore' skips any column that is already gone, so the cell can be re-run safely.
df.drop(
    columns=['Country', 'Customer Name', 'Customer e-mail'],
    inplace=True,
    errors='ignore',
)

In [None]:
# Check the frame after the column drop.
df.head()

Plot only the 3 features that appear to be linearly related to the target (Age, Net Worth and Annual Salary)

In [None]:
# Scatter plots with a fitted regression line: target vs. each selected feature.
x_elements = ['Age', 'Annual Salary', 'Net Worth']
y_elements = 'Car Purchase Amount'
sns.pairplot(
    data=df,
    x_vars=x_elements,
    y_vars=y_elements,
    kind="reg",
    height=3,
)

 Looking at the plots, there are mainly 3 features that appear linearly related to the target and can be used to train the model: *Age*, *Net Worth* and *Annual Salary*.
 As a first trial, we will include all the features in the model to see whether this gives good results.


Define X and y for the model

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

### Standardize the data

In [None]:
# Target and predictor column names; both are lists so they can be concatenated
# and used for column selection below.
y_feature = ['Car Purchase Amount']
x_features = ['Age', 'Annual Salary', 'Net Worth', 'Credit Card Debt', 'Is_Male']
# Keep only the modelling columns, predictors first and target last.
data = df[x_features + y_feature]

In [None]:

# Two independent StandardScalers: one for the predictors and one for the target.
# They are kept as separate named transformers so each can be looked up again later
# via ct.named_transformers_.
# NOTE(review): the scalers are fitted on ALL rows here, before the train/test split
# performed in a later cell, so test-set statistics influence the scaling. Consider
# splitting first and fitting the scalers on the training rows only.
ct = ColumnTransformer(
    [("Features", StandardScaler(), x_features),
     ("Car price Amount", StandardScaler(), y_feature)
     ], verbose_feature_names_out=False)

p = ct.fit_transform(data)

# Rebuild a labelled frame from the transformed array using the transformer's output names.
sub_df = pd.DataFrame(p, columns=ct.get_feature_names_out())
#
X = sub_df[x_features]
# y is selected with a list, so it is a one-column DataFrame rather than a Series.
y = sub_df[y_feature]


In [None]:
# Hold out 30% of the rows for testing. The seed is fixed so that the split, and
# every coefficient and metric derived from it, is reproducible on Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)

## Train the model

In [None]:
# Ordinary least-squares regression fitted on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

### Get all coefficients

In [None]:
# Intercept of the fitted model (in standardized target units, since y was scaled).
print(f'The intercept b0 = {model.intercept_}')


In [None]:
# Print one coefficient per feature, numbered from b1.
# The target was passed to fit() as a one-column frame, so coef_ is 2-D; np.ravel flattens it
# so each coefficient prints as a scalar rather than a one-element array.
# (The original manual `ind += 1` was dead code: enumerate rebinds `ind` on every iteration.)
for ind, (feature_name, coeff) in enumerate(zip(X.columns, np.ravel(model.coef_)), start=1):
    print(f'The value of b{ind} {feature_name} = {coeff}')

$ R^2 $ calculation

In [None]:
# Coefficient of determination of the fitted model on the held-out test split.
R_2_for_test_data = model.score(X_test, y_test)
print(f'R2 score for the test data is {R_2_for_test_data}')

Calculate the error metrics (MAE, MSE and RMSE) on the *test* data

In [None]:
# Predict on the test split and report the error metrics (in standardized target units).
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print('MAE:', mae)
print('MSE:', mse)
# RMSE is the square root of the MSE computed above.
print('RMSE:', np.sqrt(mse))

In [None]:
# Refit the same regression with statsmodels to get the full statistical summary.
# sm.OLS does not add an intercept on its own, hence the explicit constant column.
X_With_Constant = sm.add_constant(X_train)
sm_model = sm.OLS(y_train, X_With_Constant).fit()
sm_model.summary()


### PLAY WITH THE MODEL

In [None]:
# Predict the purchase amount for one hand-specified customer.
# The fitted scalers are retrieved from the ColumnTransformer so the raw inputs can be
# standardized and the prediction mapped back to currency units.
x_transformer = ct.named_transformers_['Features']
y_transformer = ct.named_transformers_['Car price Amount']

AGE = 50
SALARY = 30294
NETWORTH = 349204
CREDIT_CARD_DEBT = 10000
IS_MALE = 1

# The scaler and the model were fitted on all five columns in x_features, so the input
# must supply every one of them, in the same order. Passing only three columns makes
# transform() raise a ValueError.
x = pd.DataFrame(
    [[AGE, SALARY, NETWORTH, CREDIT_CARD_DEBT, IS_MALE]],
    columns=x_features,
)

# Standardize the raw inputs; keep the column names so the model sees the same
# feature names it was trained with.
x_to_predict = pd.DataFrame(x_transformer.transform(x), columns=x_features)

predicted_amount = model.predict(x_to_predict)  # prediction in standardized target units

# Undo the target scaling; reshape guards against a 1-D prediction array.
predicted_in_currency = y_transformer.inverse_transform(np.reshape(predicted_amount, (-1, 1)))
print(f'Predicted Amount = {predicted_in_currency[0][0]}')


In [None]:
# Build a frame of fitted values and residuals for the test split.
# A residual is observed minus fitted (the convention statsmodels and most texts use);
# the original cell computed fitted minus observed, which flips the sign.
fitted = np.ravel(y_pred)
observed = np.ravel(y_test.to_numpy())
residual_fitted_df = pd.DataFrame({'Fitted': fitted, 'Residual': observed - fitted})

Plot residuals against fitted values

In [None]:
# Residuals vs. fitted values, with a horizontal reference line at the mean residual.
mean_residual = residual_fitted_df['Residual'].mean()
fig, ax = plt.subplots(figsize=(10, 10))
# Draw on the axes created above rather than relying on pyplot's implicit current axes.
sns.scatterplot(data=residual_fitted_df, x="Fitted", y="Residual", ax=ax)
# axhline spans the full width of the axes whatever the data range is, unlike a
# segment hard-coded to x in [-3, 3].
ax.axhline(mean_residual, linewidth=2, color='r', label='Mean residual')
ax.set_title('Residuals vs. fitted values (test set)')
ax.legend()
plt.show()