In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('data/automobileEDA.csv')

In [3]:
df.columns

Index(['symboling', 'normalized-losses', 'make', 'aspiration', 'num-of-doors',
       'body-style', 'drive-wheels', 'engine-location', 'wheel-base', 'length',
       'width', 'height', 'curb-weight', 'engine-type', 'num-of-cylinders',
       'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-ratio',
       'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price',
       'city-L/100km', 'horsepower-binned', 'diesel', 'gas'],
      dtype='object')

In [4]:
from dataprep.clean import clean_headers

In [5]:
df = clean_headers(df)

Column Headers Cleaning Report:
	17 values cleaned (58.62%)


In [6]:
df.shape

(201, 29)

In [7]:
df.columns

Index(['symboling', 'normalized_losses', 'make', 'aspiration', 'num_of_doors',
       'body_style', 'drive_wheels', 'engine_location', 'wheel_base', 'length',
       'width', 'height', 'curb_weight', 'engine_type', 'num_of_cylinders',
       'engine_size', 'fuel_system', 'bore', 'stroke', 'compression_ratio',
       'horsepower', 'peak_rpm', 'city_mpg', 'highway_mpg', 'price',
       'city_l_100km', 'horsepower_binned', 'diesel', 'gas'],
      dtype='object')

In [8]:
df.dtypes

symboling              int64
normalized_losses      int64
make                  object
aspiration            object
num_of_doors          object
body_style            object
drive_wheels          object
engine_location       object
wheel_base           float64
length               float64
width                float64
height               float64
curb_weight            int64
engine_type           object
num_of_cylinders      object
engine_size            int64
fuel_system           object
bore                 float64
stroke               float64
compression_ratio    float64
horsepower           float64
peak_rpm             float64
city_mpg               int64
highway_mpg            int64
price                float64
city_l_100km         float64
horsepower_binned     object
diesel                 int64
gas                    int64
dtype: object

In [9]:
df.duplicated().any()

False

In [10]:
df.isnull().sum()

symboling            0
normalized_losses    0
make                 0
aspiration           0
num_of_doors         0
body_style           0
drive_wheels         0
engine_location      0
wheel_base           0
length               0
width                0
height               0
curb_weight          0
engine_type          0
num_of_cylinders     0
engine_size          0
fuel_system          0
bore                 0
stroke               4
compression_ratio    0
horsepower           0
peak_rpm             0
city_mpg             0
highway_mpg          0
price                0
city_l_100km         0
horsepower_binned    1
diesel               0
gas                  0
dtype: int64

In [11]:
df['stroke'].fillna(df['stroke'].median(), inplace=True)
df['horsepower_binned'].fillna(df['horsepower_binned'].mode()[0], inplace=True)

In [12]:
# Define a function to remove outliers based on IQR method
def remove_outliers(df, column_name, lower_percentile=0.25, upper_percentile=0.75, threshold=1.5):
    Q1 = df[column_name].quantile(lower_percentile)
    Q3 = df[column_name].quantile(upper_percentile)
    IQR = Q3 - Q1
    lower_bound = Q1 - threshold * IQR
    upper_bound = Q3 + threshold * IQR
    return df[(df[column_name] >= lower_bound) & (df[column_name] <= upper_bound)]

In [13]:
for col in df.select_dtypes(include=['int','float']).columns:
    df = remove_outliers(df, col)

In [14]:
X = df.drop(columns='price')
y = df['price']

In [15]:
num_cols = X.select_dtypes(include=['int','float']).columns # 18 columns

In [16]:
cat_cols = X.select_dtypes(include='object').columns # 10 columns

In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Xây dựng ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols)
    ])

# Xây dựng pipeline kết hợp xử lý dữ liệu và mô hình hóa
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', LinearRegression())])

In [20]:
# Huấn luyện mô hình thông qua pipeline
pipeline.fit(X_train, y_train)

In [21]:
# Dự đoán giá trên tập kiểm tra
y_pred = pipeline.predict(X_test)

In [22]:
# Đánh giá mô hình
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Squared Error: 4917868.815609944
R-squared: 0.6785655587239592
