In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Xây dựng model dự đoán price từ horsepower, curb-weight, engine-size và highway-mpg

In [None]:
# Load the automobile dataset from the project-relative data folder.
csv_path = 'data/automobileEDA.csv'
df = pd.read_csv(csv_path)

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Preview the first rows to eyeball column contents.
df.head()

In [None]:
# List the available column names before choosing predictors.
df.columns

In [None]:
# Numeric predictors used by the first price model.
features = [
    'horsepower',
    'curb-weight',
    'engine-size',
    'highway-mpg',
]

In [None]:
# Check the dtypes of the chosen predictors before fitting.
df[features].dtypes

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# Design matrix (the selected predictors) and regression target (price).
X, y = df[features], df['price']

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state (seed) keeps
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a linear regression on the training split only; the fitted estimator
# is the cell's displayed value.
lm = LinearRegression()
lm.fit(X=X_train, y=y_train)

In [None]:
# Predicted prices for the held-out test rows.
yhat = lm.predict(X_test)

In [None]:
# Fitted parameters: `a` is the intercept, `b` the array of slopes
# (one per column of X, in column order). Display both.
a, b = lm.intercept_, lm.coef_
(a, b)

In [None]:
# y = a + b[0]*X1 + b[1]*X2 + b[2]*X3 + b[3]*X4   (a = intercept, b = coefficients)

In [None]:
# Report R^2 on the full data, the training split and the test split.
r2_reports = [
    ('The full R-square is:', X, y),
    ('The train R-square is:', X_train, y_train),
    ('The test R-square is:', X_test, y_test),
]
for message, inputs, target in r2_reports:
    print(message, lm.score(inputs, target))
# Author's observation: with the result above, about 79% of the variation in
# price is explained by this multiple linear regression.
# In addition, the R^2 values on both train and test are fairly high.
# => The model fits the data reasonably well.

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Error of the test-set predictions against the true prices:
# mean squared error and mean absolute error.
mse, mae = mean_squared_error(y_test, yhat), mean_absolute_error(y_test, yhat)
print('The MSE of price and predicted value is: ', mse)
print('The MAE of price and predicted value is: ', mae)

In [None]:
# Predicted vs. true price on the test split. Points lying on the red
# identity line are perfect predictions.
fig, ax = plt.subplots()
ax.scatter(yhat, y_test)
# Span the identity line over the observed value range instead of a
# hard-coded [0, 50000], so it stays valid if the price scale changes.
lower = min(np.min(yhat), np.min(y_test))
upper = max(np.max(yhat), np.max(y_test))
ax.plot([lower, upper], [lower, upper], '-', color='r')
ax.set_xlabel('Model Predictions')
ax.set_ylabel('True Value')
ax.set_title('Predicted vs. true price (test set)')
plt.show()
# Author's observation: predictions are only accurate for prices between
# roughly 10000 and 20000.

In [None]:
# Compare the density of actual vs. predicted prices, training split on the
# left and test split on the right.
# sns.distplot is deprecated; sns.kdeplot draws the same density-only curve
# that distplot(hist=False) produced.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(7, 4))

sns.kdeplot(y_train, color="r", label="Actual Train", ax=ax1)
sns.kdeplot(lm.predict(X_train), color="b", label="Predicted Train", ax=ax1)
ax1.legend()

sns.kdeplot(y_test, color="r", label="Actual Test", ax=ax2)
sns.kdeplot(lm.predict(X_test), color="b", label="Predicted Test", ax=ax2)
ax2.legend()

plt.show()

In [None]:
# Two hand-entered observations (one per row) to predict on, using the same
# column names and order as the training predictors (validation).
X_new = pd.DataFrame(
    [
        [115, 2824, 136, 22],
        [120, 3000, 150, 18],
    ],
    columns=['horsepower', 'curb-weight', 'engine-size', 'highway-mpg'],
)

In [None]:
# Predict prices for the hand-entered observations in X_new.
yhat_new = lm.predict(X_new)

In [None]:
# Display the predicted prices, one per row of X_new.
yhat_new

## Input thêm 2 biến phân loại: num-of-doors và make

In [None]:
# Distinct values of the door-count column, to decide how to encode it.
df['num-of-doors'].unique()

In [None]:
# Distinct manufacturer labels, to decide how to encode this column.
df['make'].unique()

In [None]:
from analysis.analyzer import TTTH_Analyzer as Analyzer

In [None]:
# Instantiate the project's analysis helper (used below for the ANOVA table).
_analyzer = Analyzer()
# Apply seaborn's default theme to the plots that follow.
sns.set()

In [None]:
# Rename the hyphenated column to an identifier-style name.
# NOTE(review): presumably required by the ANOVA helper called next — confirm.
# Reassign instead of inplace=True: same resulting frame, without mutating
# the object in place.
df = df.rename(columns={'num-of-doors': 'num_of_doors'})

In [None]:
# Run the project's ANOVA helper for price against the two categorical
# variables. NOTE(review): the helper's behavior is defined in
# analysis.analyzer and is not visible here.
anova_args = {
    'continous_var': 'price',
    'category_vars': ['num_of_doors', 'make'],
    'df': df,
}
_analyzer.analyze_anova_table_for_continous_vs_categories(**anova_args)

In [None]:
# Bar height is the aggregated price per door-count category (seaborn's
# default estimator is the mean); ci=None turns the error bars off.
sns.barplot(x='num_of_doors', y='price', data=df, ci=None)
plt.show()

In [None]:
# Horizontal bars: aggregated price per manufacturer (category on the y axis),
# with error bars disabled via ci=None.
sns.barplot(x='price', y='make', data=df, ci=None)
plt.show()

In [None]:
# Design matrix for the extended model: the four numeric predictors plus the
# two categorical columns (still un-encoded here).
# .copy() makes X an independent frame, so later column assignments on X are
# unambiguous and do not raise pandas' SettingWithCopyWarning (which the
# global warnings filter would otherwise hide).
X = df[['horsepower','curb-weight','engine-size','highway-mpg','num_of_doors','make']].copy()
y = df['price']

In [None]:
# Encode the num_of_doors variable as a binary flag: 'two' -> 0, anything
# else -> 1. NOTE(review): every value other than 'two' (including missing
# values, if any) becomes 1 — confirm the column only holds 'two'/'four'.
is_two_door = df['num_of_doors'] == 'two'
X['num_of_doors'] = np.where(is_two_door, 0, 1)

In [None]:
# Check the result of the door-count encoding on the first rows.
X.head()

In [None]:
# Encode the make variable as one-hot indicator columns prefixed with 'S_'.
# drop_first=True leaves one make out as the baseline level: with an
# intercept in the model, a full set of indicators always sums to 1 and is
# perfectly collinear with it, so individual coefficients would not be
# identifiable.
X = pd.get_dummies(X, columns=['make'], prefix='S', drop_first=True)

In [None]:
# Check the indicator columns created for make on the first rows.
X.head()

In [None]:
# Re-split with the encoded feature matrix: 20% of rows held out for testing,
# same fixed random_state as the first model.
split_options = {'random_state': 42, 'test_size': 0.2}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Refit a fresh linear regression on the extended training features,
# replacing the earlier model bound to `lm`.
lm = LinearRegression()
lm.fit(X=X_train, y=y_train)

In [None]:
# R^2 of the extended model on the full data, the training split and the
# test split.
score_messages = (
    'The full R-square is:',
    'The train R-square is:',
    'The test R-square is:',
)
score_inputs = ((X, y), (X_train, y_train), (X_test, y_test))
for score_message, (part_X, part_y) in zip(score_messages, score_inputs):
    print(score_message, lm.score(part_X, part_y))

In [None]:
# tiếp tục vẽ các biểu đồ và cho nhận xét ...